Print this page
    
10409 ipf sometimes freezes RFC 1323 transfers
Reviewed by: Jason King <jbk@joyent.com>
    
      
        | Split | 
	Close | 
      
      | Expand all | 
      | Collapse all | 
    
    
          --- old/usr/src/uts/common/inet/ipf/ip_state.c
          +++ new/usr/src/uts/common/inet/ipf/ip_state.c
   1    1  /*
   2    2   * Copyright (C) 1995-2003 by Darren Reed.
   3    3   *
   4    4   * See the IPFILTER.LICENCE file for details on licencing.
   5    5   *
   6    6   * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
   7    7   *
   8    8   * Copyright (c) 2014, Joyent, Inc.  All rights reserved.
   9    9   */
  10   10  
  11   11  #if defined(KERNEL) || defined(_KERNEL)
  12   12  # undef KERNEL
  13   13  # undef _KERNEL
  14   14  # define        KERNEL  1
  15   15  # define        _KERNEL 1
  16   16  #endif
  17   17  #include <sys/errno.h>
  18   18  #include <sys/types.h>
  19   19  #include <sys/param.h>
  20   20  #include <sys/file.h>
  21   21  #if defined(__NetBSD__) && (NetBSD >= 199905) && !defined(IPFILTER_LKM) && \
  22   22      defined(_KERNEL)
  23   23  # include "opt_ipfilter_log.h"
  24   24  #endif
  25   25  #if defined(_KERNEL) && defined(__FreeBSD_version) && \
  26   26      (__FreeBSD_version >= 400000) && !defined(KLD_MODULE)
  27   27  #include "opt_inet6.h"
  28   28  #endif
  29   29  #if !defined(_KERNEL) && !defined(__KERNEL__)
  30   30  # include <stdio.h>
  31   31  # include <stdlib.h>
  32   32  # include <string.h>
  33   33  # define _KERNEL
  34   34  # ifdef __OpenBSD__
  35   35  struct file;
  36   36  # endif
  37   37  # include <sys/uio.h>
  38   38  # undef _KERNEL
  39   39  #endif
  40   40  #if defined(_KERNEL) && (__FreeBSD_version >= 220000)
  41   41  # include <sys/filio.h>
  42   42  # include <sys/fcntl.h>
  43   43  # if (__FreeBSD_version >= 300000) && !defined(IPFILTER_LKM)
  44   44  #  include "opt_ipfilter.h"
  45   45  # endif
  46   46  #else
  47   47  # include <sys/ioctl.h>
  48   48  #endif
  49   49  #include <sys/time.h>
  50   50  #if !defined(linux)
  51   51  # include <sys/protosw.h>
  52   52  #endif
  53   53  #include <sys/socket.h>
  54   54  #if defined(_KERNEL)
  55   55  # include <sys/systm.h>
  56   56  # if !defined(__SVR4) && !defined(__svr4__)
  57   57  #  include <sys/mbuf.h>
  58   58  # endif
  59   59  #endif
  60   60  #if defined(__SVR4) || defined(__svr4__)
  61   61  # include <sys/filio.h>
  62   62  # include <sys/byteorder.h>
  63   63  # ifdef _KERNEL
  64   64  #  include <sys/dditypes.h>
  65   65  # endif
  66   66  # include <sys/stream.h>
  67   67  # include <sys/kmem.h>
  68   68  #endif
  69   69  
  70   70  #include <net/if.h>
  71   71  #ifdef sun
  72   72  # include <net/af.h>
  73   73  #endif
  74   74  #include <net/route.h>
  75   75  #include <netinet/in.h>
  76   76  #include <netinet/in_systm.h>
  77   77  #include <netinet/ip.h>
  78   78  #include <netinet/tcp.h>
  79   79  #if !defined(linux)
  80   80  # include <netinet/ip_var.h>
  81   81  #endif
  82   82  #if !defined(__hpux) && !defined(linux)
  83   83  # include <netinet/tcp_fsm.h>
  84   84  #endif
  85   85  #include <netinet/udp.h>
  86   86  #include <netinet/ip_icmp.h>
  87   87  #include "netinet/ip_compat.h"
  88   88  #include <netinet/tcpip.h>
  89   89  #include "netinet/ip_fil.h"
  90   90  #include "netinet/ip_nat.h"
  91   91  #include "netinet/ip_frag.h"
  92   92  #include "netinet/ip_state.h"
  93   93  #include "netinet/ip_proxy.h"
  94   94  #include "netinet/ipf_stack.h"
  95   95  #ifdef  IPFILTER_SYNC
  96   96  #include "netinet/ip_sync.h"
  97   97  #endif
  98   98  #ifdef  IPFILTER_SCAN
  99   99  #include "netinet/ip_scan.h"
 100  100  #endif
 101  101  #ifdef  USE_INET6
 102  102  #include <netinet/icmp6.h>
 103  103  #endif
 104  104  #if (__FreeBSD_version >= 300000)
 105  105  # include <sys/malloc.h>
 106  106  # if defined(_KERNEL) && !defined(IPFILTER_LKM)
 107  107  #  include <sys/libkern.h>
 108  108  #  include <sys/systm.h>
 109  109  # endif
 110  110  #endif
 111  111  /* END OF INCLUDES */
 112  112  
 113  113  
 114  114  #if !defined(lint)
 115  115  static const char sccsid[] = "@(#)ip_state.c    1.8 6/5/96 (C) 1993-2000 Darren Reed";
 116  116  static const char rcsid[] = "@(#)$Id: ip_state.c,v 2.186.2.36 2005/08/11 19:58:03 darrenr Exp $";
 117  117  #endif
 118  118  
 119  119  #ifdef  USE_INET6
 120  120  static ipstate_t *fr_checkicmp6matchingstate __P((fr_info_t *));
 121  121  #endif
 122  122  static ipstate_t *fr_matchsrcdst __P((fr_info_t *, ipstate_t *, i6addr_t *,
 123  123                                        i6addr_t *, tcphdr_t *, u_32_t));
 124  124  static ipstate_t *fr_checkicmpmatchingstate __P((fr_info_t *));
 125  125  static int fr_state_flush __P((int, int, ipf_stack_t *));
 126  126  static ips_stat_t *fr_statetstats __P((ipf_stack_t *));
 127  127  static int fr_state_remove __P((caddr_t, ipf_stack_t *));
 128  128  static void fr_ipsmove __P((ipstate_t *, u_int, ipf_stack_t *));
 129  129  static int fr_tcpstate __P((fr_info_t *, tcphdr_t *, ipstate_t *));
 130  130  static int fr_tcpoptions __P((fr_info_t *, tcphdr_t *, tcpdata_t *));
 131  131  static ipstate_t *fr_stclone __P((fr_info_t *, tcphdr_t *, ipstate_t *));
 132  132  static void fr_fixinisn __P((fr_info_t *, ipstate_t *));
 133  133  static void fr_fixoutisn __P((fr_info_t *, ipstate_t *));
 134  134  static void fr_checknewisn __P((fr_info_t *, ipstate_t *));
 135  135  static int fr_stateiter __P((ipftoken_t *, ipfgeniter_t *, ipf_stack_t *));
 136  136  
 137  137  int fr_stputent __P((caddr_t, ipf_stack_t *));
 138  138  int fr_stgetent __P((caddr_t, ipf_stack_t *));
 139  139  
 140  140  #define ONE_DAY         IPF_TTLVAL(1 * 86400)   /* 1 day */
 141  141  #define FIVE_DAYS       (5 * ONE_DAY)
 142  142  #define DOUBLE_HASH(x, ifs)     \
 143  143      (((x) + ifs->ifs_ips_seed[(x) % ifs->ifs_fr_statesize]) % ifs->ifs_fr_statesize)
 144  144  
 145  145  
 146  146  /* ------------------------------------------------------------------------ */
 147  147  /* Function:    fr_stateinit                                                */
 148  148  /* Returns:     int - 0 == success, -1 == failure                           */
 149  149  /* Parameters:  ifs - ipf stack instance                                    */
 150  150  /*                                                                          */
 151  151  /* Initialise all the global variables used within the state code.          */
 152  152  /* This action also includes initiailising locks.                           */
 153  153  /* ------------------------------------------------------------------------ */
 154  154  int fr_stateinit(ifs)
 155  155  ipf_stack_t *ifs;
 156  156  {
 157  157  #if defined(NEED_LOCAL_RAND) || !defined(_KERNEL)
 158  158          struct timeval tv;
 159  159  #endif
 160  160          int i;
 161  161  
 162  162          KMALLOCS(ifs->ifs_ips_table, ipstate_t **, 
 163  163                   ifs->ifs_fr_statesize * sizeof(ipstate_t *));
 164  164          if (ifs->ifs_ips_table == NULL)
 165  165                  return -1;
 166  166          bzero((char *)ifs->ifs_ips_table, 
 167  167                ifs->ifs_fr_statesize * sizeof(ipstate_t *));
 168  168  
 169  169          KMALLOCS(ifs->ifs_ips_seed, u_long *,
 170  170                   ifs->ifs_fr_statesize * sizeof(*ifs->ifs_ips_seed));
 171  171          if (ifs->ifs_ips_seed == NULL)
 172  172                  return -2;
 173  173  #if defined(NEED_LOCAL_RAND) || !defined(_KERNEL)
 174  174          tv.tv_sec = 0;
 175  175          GETKTIME(&tv);
 176  176  #endif
 177  177          for (i = 0; i < ifs->ifs_fr_statesize; i++) {
 178  178                  /*
 179  179                   * XXX - ips_seed[X] should be a random number of sorts.
 180  180                   */
 181  181  #if !defined(NEED_LOCAL_RAND) && defined(_KERNEL)
 182  182                  ifs->ifs_ips_seed[i] = ipf_random();
 183  183  #else
 184  184                  ifs->ifs_ips_seed[i] = ((u_long)ifs->ifs_ips_seed + i) *
 185  185                      ifs->ifs_fr_statesize;
 186  186                  ifs->ifs_ips_seed[i] += tv.tv_sec;
 187  187                  ifs->ifs_ips_seed[i] *= (u_long)ifs->ifs_ips_seed;
 188  188                  ifs->ifs_ips_seed[i] ^= 0x5a5aa5a5;
 189  189                  ifs->ifs_ips_seed[i] *= ifs->ifs_fr_statemax;
 190  190  #endif
 191  191          }
 192  192  
 193  193          /* fill icmp reply type table */
 194  194          for (i = 0; i <= ICMP_MAXTYPE; i++)
 195  195                  icmpreplytype4[i] = -1;
 196  196          icmpreplytype4[ICMP_ECHO] = ICMP_ECHOREPLY;
 197  197          icmpreplytype4[ICMP_TSTAMP] = ICMP_TSTAMPREPLY;
 198  198          icmpreplytype4[ICMP_IREQ] = ICMP_IREQREPLY;
 199  199          icmpreplytype4[ICMP_MASKREQ] = ICMP_MASKREPLY;
 200  200  #ifdef  USE_INET6
 201  201          /* fill icmp reply type table */
 202  202          for (i = 0; i <= ICMP6_MAXTYPE; i++)
 203  203                  icmpreplytype6[i] = -1;
 204  204          icmpreplytype6[ICMP6_ECHO_REQUEST] = ICMP6_ECHO_REPLY;
 205  205          icmpreplytype6[ICMP6_MEMBERSHIP_QUERY] = ICMP6_MEMBERSHIP_REPORT;
 206  206          icmpreplytype6[ICMP6_NI_QUERY] = ICMP6_NI_REPLY;
 207  207          icmpreplytype6[ND_ROUTER_SOLICIT] = ND_ROUTER_ADVERT;
 208  208          icmpreplytype6[ND_NEIGHBOR_SOLICIT] = ND_NEIGHBOR_ADVERT;
 209  209  #endif
 210  210  
 211  211          KMALLOCS(ifs->ifs_ips_stats.iss_bucketlen, u_long *,
 212  212                   ifs->ifs_fr_statesize * sizeof(u_long));
 213  213          if (ifs->ifs_ips_stats.iss_bucketlen == NULL)
 214  214                  return -1;
 215  215          bzero((char *)ifs->ifs_ips_stats.iss_bucketlen, 
 216  216                ifs->ifs_fr_statesize * sizeof(u_long));
 217  217  
 218  218          if (ifs->ifs_fr_state_maxbucket == 0) {
 219  219                  for (i = ifs->ifs_fr_statesize; i > 0; i >>= 1)
 220  220                          ifs->ifs_fr_state_maxbucket++;
 221  221                  ifs->ifs_fr_state_maxbucket *= 2;
 222  222          }
 223  223  
 224  224          fr_sttab_init(ifs->ifs_ips_tqtqb, ifs);
 225  225          ifs->ifs_ips_tqtqb[IPF_TCP_NSTATES - 1].ifq_next = &ifs->ifs_ips_udptq;
 226  226          ifs->ifs_ips_udptq.ifq_ttl = (u_long)ifs->ifs_fr_udptimeout;
 227  227          ifs->ifs_ips_udptq.ifq_ref = 1;
 228  228          ifs->ifs_ips_udptq.ifq_head = NULL;
 229  229          ifs->ifs_ips_udptq.ifq_tail = &ifs->ifs_ips_udptq.ifq_head;
 230  230          MUTEX_INIT(&ifs->ifs_ips_udptq.ifq_lock, "ipftq udp tab");
 231  231          ifs->ifs_ips_udptq.ifq_next = &ifs->ifs_ips_udpacktq;
 232  232          ifs->ifs_ips_udpacktq.ifq_ttl = (u_long)ifs->ifs_fr_udpacktimeout;
 233  233          ifs->ifs_ips_udpacktq.ifq_ref = 1;
 234  234          ifs->ifs_ips_udpacktq.ifq_head = NULL;
 235  235          ifs->ifs_ips_udpacktq.ifq_tail = &ifs->ifs_ips_udpacktq.ifq_head;
 236  236          MUTEX_INIT(&ifs->ifs_ips_udpacktq.ifq_lock, "ipftq udpack tab");
 237  237          ifs->ifs_ips_udpacktq.ifq_next = &ifs->ifs_ips_icmptq;
 238  238          ifs->ifs_ips_icmptq.ifq_ttl = (u_long)ifs->ifs_fr_icmptimeout;
 239  239          ifs->ifs_ips_icmptq.ifq_ref = 1;
 240  240          ifs->ifs_ips_icmptq.ifq_head = NULL;
 241  241          ifs->ifs_ips_icmptq.ifq_tail = &ifs->ifs_ips_icmptq.ifq_head;
 242  242          MUTEX_INIT(&ifs->ifs_ips_icmptq.ifq_lock, "ipftq icmp tab");
 243  243          ifs->ifs_ips_icmptq.ifq_next = &ifs->ifs_ips_icmpacktq;
 244  244          ifs->ifs_ips_icmpacktq.ifq_ttl = (u_long)ifs->ifs_fr_icmpacktimeout;
 245  245          ifs->ifs_ips_icmpacktq.ifq_ref = 1;
 246  246          ifs->ifs_ips_icmpacktq.ifq_head = NULL;
 247  247          ifs->ifs_ips_icmpacktq.ifq_tail = &ifs->ifs_ips_icmpacktq.ifq_head;
 248  248          MUTEX_INIT(&ifs->ifs_ips_icmpacktq.ifq_lock, "ipftq icmpack tab");
 249  249          ifs->ifs_ips_icmpacktq.ifq_next = &ifs->ifs_ips_iptq;
 250  250          ifs->ifs_ips_iptq.ifq_ttl = (u_long)ifs->ifs_fr_iptimeout;
 251  251          ifs->ifs_ips_iptq.ifq_ref = 1;
 252  252          ifs->ifs_ips_iptq.ifq_head = NULL;
 253  253          ifs->ifs_ips_iptq.ifq_tail = &ifs->ifs_ips_iptq.ifq_head;
 254  254          MUTEX_INIT(&ifs->ifs_ips_iptq.ifq_lock, "ipftq ip tab");
 255  255          ifs->ifs_ips_iptq.ifq_next = &ifs->ifs_ips_deletetq;
 256  256          /* entry's ttl in deletetq is just 1 tick */
 257  257          ifs->ifs_ips_deletetq.ifq_ttl = (u_long) 1;
 258  258          ifs->ifs_ips_deletetq.ifq_ref = 1;
 259  259          ifs->ifs_ips_deletetq.ifq_head = NULL;
 260  260          ifs->ifs_ips_deletetq.ifq_tail = &ifs->ifs_ips_deletetq.ifq_head;
 261  261          MUTEX_INIT(&ifs->ifs_ips_deletetq.ifq_lock, "state delete queue");
 262  262          ifs->ifs_ips_deletetq.ifq_next = NULL;
 263  263  
 264  264          RWLOCK_INIT(&ifs->ifs_ipf_state, "ipf IP state rwlock");
 265  265          MUTEX_INIT(&ifs->ifs_ipf_stinsert, "ipf state insert mutex");
 266  266          ifs->ifs_fr_state_init = 1;
 267  267  
 268  268          ifs->ifs_ips_last_force_flush = ifs->ifs_fr_ticks;
 269  269          return 0;
 270  270  }
 271  271  
 272  272  
 273  273  /* ------------------------------------------------------------------------ */
 274  274  /* Function:    fr_stateunload                                              */
 275  275  /* Returns:     Nil                                                         */
 276  276  /* Parameters:  ifs - ipf stack instance                                    */
 277  277  /*                                                                          */
 278  278  /* Release and destroy any resources acquired or initialised so that        */
 279  279  /* IPFilter can be unloaded or re-initialised.                              */
 280  280  /* ------------------------------------------------------------------------ */
 281  281  void fr_stateunload(ifs)
 282  282  ipf_stack_t *ifs;
 283  283  {
 284  284          ipftq_t *ifq, *ifqnext;
 285  285          ipstate_t *is;
 286  286  
 287  287          while ((is = ifs->ifs_ips_list) != NULL)
 288  288              (void) fr_delstate(is, 0, ifs);
 289  289  
 290  290          /*
 291  291           * Proxy timeout queues are not cleaned here because although they
 292  292           * exist on the state list, appr_unload is called after fr_stateunload
 293  293           * and the proxies actually are responsible for them being created.
 294  294           * Should the proxy timeouts have their own list?  There's no real
 295  295           * justification as this is the only complicationA
 296  296           */
 297  297          for (ifq = ifs->ifs_ips_utqe; ifq != NULL; ifq = ifqnext) {
 298  298                  ifqnext = ifq->ifq_next;
 299  299                  if (((ifq->ifq_flags & IFQF_PROXY) == 0) &&
 300  300                      (fr_deletetimeoutqueue(ifq) == 0))
 301  301                          fr_freetimeoutqueue(ifq, ifs);
 302  302          }
 303  303  
 304  304          ifs->ifs_ips_stats.iss_inuse = 0;
 305  305          ifs->ifs_ips_num = 0;
 306  306  
 307  307          if (ifs->ifs_fr_state_init == 1) {
 308  308                  fr_sttab_destroy(ifs->ifs_ips_tqtqb);
 309  309                  MUTEX_DESTROY(&ifs->ifs_ips_udptq.ifq_lock);
 310  310                  MUTEX_DESTROY(&ifs->ifs_ips_icmptq.ifq_lock);
 311  311                  MUTEX_DESTROY(&ifs->ifs_ips_udpacktq.ifq_lock);
 312  312                  MUTEX_DESTROY(&ifs->ifs_ips_icmpacktq.ifq_lock);
 313  313                  MUTEX_DESTROY(&ifs->ifs_ips_iptq.ifq_lock);
 314  314                  MUTEX_DESTROY(&ifs->ifs_ips_deletetq.ifq_lock);
 315  315          }
 316  316  
 317  317          if (ifs->ifs_ips_table != NULL) {
 318  318                  KFREES(ifs->ifs_ips_table, 
 319  319                         ifs->ifs_fr_statesize * sizeof(*ifs->ifs_ips_table));
 320  320                  ifs->ifs_ips_table = NULL;
 321  321          }
 322  322  
 323  323          if (ifs->ifs_ips_seed != NULL) {
 324  324                  KFREES(ifs->ifs_ips_seed, 
 325  325                         ifs->ifs_fr_statesize * sizeof(*ifs->ifs_ips_seed));
 326  326                  ifs->ifs_ips_seed = NULL;
 327  327          }
 328  328  
 329  329          if (ifs->ifs_ips_stats.iss_bucketlen != NULL) {
 330  330                  KFREES(ifs->ifs_ips_stats.iss_bucketlen, 
 331  331                         ifs->ifs_fr_statesize * sizeof(u_long));
 332  332                  ifs->ifs_ips_stats.iss_bucketlen = NULL;
 333  333          }
 334  334  
 335  335          if (ifs->ifs_fr_state_maxbucket_reset == 1)
 336  336                  ifs->ifs_fr_state_maxbucket = 0;
 337  337  
 338  338          if (ifs->ifs_fr_state_init == 1) {
 339  339                  ifs->ifs_fr_state_init = 0;
 340  340                  RW_DESTROY(&ifs->ifs_ipf_state);
 341  341                  MUTEX_DESTROY(&ifs->ifs_ipf_stinsert);
 342  342          }
 343  343  }
 344  344  
 345  345  
 346  346  /* ------------------------------------------------------------------------ */
 347  347  /* Function:    fr_statetstats                                              */
 348  348  /* Returns:     ips_state_t* - pointer to state stats structure             */
 349  349  /* Parameters:  Nil                                                         */
 350  350  /*                                                                          */
 351  351  /* Put all the current numbers and pointers into a single struct and return */
 352  352  /* a pointer to it.                                                         */
 353  353  /* ------------------------------------------------------------------------ */
 354  354  static ips_stat_t *fr_statetstats(ifs)
 355  355  ipf_stack_t *ifs;
 356  356  {
 357  357          ifs->ifs_ips_stats.iss_active = ifs->ifs_ips_num;
 358  358          ifs->ifs_ips_stats.iss_statesize = ifs->ifs_fr_statesize;
 359  359          ifs->ifs_ips_stats.iss_statemax = ifs->ifs_fr_statemax;
 360  360          ifs->ifs_ips_stats.iss_table = ifs->ifs_ips_table;
 361  361          ifs->ifs_ips_stats.iss_list = ifs->ifs_ips_list;
 362  362          ifs->ifs_ips_stats.iss_ticks = ifs->ifs_fr_ticks;
 363  363          return &ifs->ifs_ips_stats;
 364  364  }
 365  365  
 366  366  /* ------------------------------------------------------------------------ */
 367  367  /* Function:    fr_state_remove                                             */
 368  368  /* Returns:     int - 0 == success, != 0 == failure                         */
 369  369  /* Parameters:  data(I) - pointer to state structure to delete from table   */
 370  370  /*              ifs - ipf stack instance                                    */
 371  371  /*                                                                          */
 372  372  /* Search for a state structure that matches the one passed, according to   */
 373  373  /* the IP addresses and other protocol specific information.                */
 374  374  /* ------------------------------------------------------------------------ */
 375  375  static int fr_state_remove(data, ifs)
 376  376  caddr_t data;
 377  377  ipf_stack_t *ifs;
 378  378  {
 379  379          ipstate_t *sp, st;
 380  380          int error;
 381  381  
 382  382          sp = &st;
 383  383          error = fr_inobj(data, &st, IPFOBJ_IPSTATE);
 384  384          if (error)
 385  385                  return EFAULT;
 386  386  
 387  387          WRITE_ENTER(&ifs->ifs_ipf_state);
 388  388          for (sp = ifs->ifs_ips_list; sp; sp = sp->is_next)
 389  389                  if ((sp->is_p == st.is_p) && (sp->is_v == st.is_v) &&
 390  390                      !bcmp((caddr_t)&sp->is_src, (caddr_t)&st.is_src,
 391  391                            sizeof(st.is_src)) &&
 392  392                      !bcmp((caddr_t)&sp->is_dst, (caddr_t)&st.is_dst,
 393  393                            sizeof(st.is_dst)) &&
 394  394                      !bcmp((caddr_t)&sp->is_ps, (caddr_t)&st.is_ps,
 395  395                            sizeof(st.is_ps))) {
 396  396                          (void) fr_delstate(sp, ISL_REMOVE, ifs);
 397  397                          RWLOCK_EXIT(&ifs->ifs_ipf_state);
 398  398                          return 0;
 399  399                  }
 400  400          RWLOCK_EXIT(&ifs->ifs_ipf_state);
 401  401          return ESRCH;
 402  402  }
 403  403  
 404  404  
 405  405  /* ------------------------------------------------------------------------ */
 406  406  /* Function:    fr_state_ioctl                                              */
 407  407  /* Returns:     int - 0 == success, != 0 == failure                         */
 408  408  /* Parameters:  data(I) - pointer to ioctl data                             */
 409  409  /*              cmd(I)  - ioctl command integer                             */
 410  410  /*              mode(I) - file mode bits used with open                     */
 411  411  /*              uid(I)  - uid of caller                                     */
 412  412  /*              ctx(I)  - pointer to give the uid context                   */
 413  413  /*              ifs     - ipf stack instance                                */
 414  414  /*                                                                          */
 415  415  /* Processes an ioctl call made to operate on the IP Filter state device.   */
 416  416  /* ------------------------------------------------------------------------ */
 417  417  int fr_state_ioctl(data, cmd, mode, uid, ctx, ifs)
 418  418  caddr_t data;
 419  419  ioctlcmd_t cmd;
 420  420  int mode, uid;
 421  421  void *ctx;
 422  422  ipf_stack_t *ifs;
 423  423  {
 424  424          int arg, ret, error = 0;
 425  425  
 426  426          switch (cmd)
 427  427          {
 428  428          /*
 429  429           * Delete an entry from the state table.
 430  430           */
 431  431          case SIOCDELST :
 432  432                  error = fr_state_remove(data, ifs);
 433  433                  break;
 434  434          /*
 435  435           * Flush the state table
 436  436           */
 437  437          case SIOCIPFFL :
 438  438                  error = BCOPYIN(data, (char *)&arg, sizeof(arg));
 439  439                  if (error != 0) {
 440  440                          error = EFAULT;
 441  441                  } else {
 442  442                          if (VALID_TABLE_FLUSH_OPT(arg)) {
 443  443                                  WRITE_ENTER(&ifs->ifs_ipf_state);
 444  444                                  ret = fr_state_flush(arg, 4, ifs);
 445  445                                  RWLOCK_EXIT(&ifs->ifs_ipf_state);
 446  446                                  error = BCOPYOUT((char *)&ret, data,
 447  447                                                  sizeof(ret));
 448  448                                  if (error != 0)
 449  449                                          return EFAULT;
 450  450                          } else {
 451  451                                  error = EINVAL;
 452  452                          }
 453  453                  }
 454  454                  break;
 455  455  
 456  456  #ifdef  USE_INET6
 457  457          case SIOCIPFL6 :
 458  458                  error = BCOPYIN(data, (char *)&arg, sizeof(arg));
 459  459                  if (error != 0) {
 460  460                          error = EFAULT;
 461  461                  } else {
 462  462                          if (VALID_TABLE_FLUSH_OPT(arg)) {
 463  463                                  WRITE_ENTER(&ifs->ifs_ipf_state);
 464  464                                  ret = fr_state_flush(arg, 6, ifs);
 465  465                                  RWLOCK_EXIT(&ifs->ifs_ipf_state);
 466  466                                  error = BCOPYOUT((char *)&ret, data,
 467  467                                                  sizeof(ret));
 468  468                                  if (error != 0)
 469  469                                          return EFAULT;
 470  470                          } else {
 471  471                                  error = EINVAL;
 472  472                          }
 473  473                  }
 474  474                  break;
 475  475  #endif
 476  476  #ifdef  IPFILTER_LOG
 477  477          /*
 478  478           * Flush the state log.
 479  479           */
 480  480          case SIOCIPFFB :
 481  481                  if (!(mode & FWRITE))
 482  482                          error = EPERM;
 483  483                  else {
 484  484                          int tmp;
 485  485  
 486  486                          tmp = ipflog_clear(IPL_LOGSTATE, ifs);
 487  487                          error = BCOPYOUT((char *)&tmp, data, sizeof(tmp));
 488  488                          if (error != 0)
 489  489                                  error = EFAULT;
 490  490                  }
 491  491                  break;
 492  492          /*
 493  493           * Turn logging of state information on/off.
 494  494           */
 495  495          case SIOCSETLG :
 496  496                  if (!(mode & FWRITE)) {
 497  497                          error = EPERM;
 498  498                  } else {
 499  499                          error = BCOPYIN((char *)data,
 500  500                                          (char *)&ifs->ifs_ipstate_logging,
 501  501                                          sizeof(ifs->ifs_ipstate_logging));
 502  502                          if (error != 0)
 503  503                                  error = EFAULT;
 504  504                  }
 505  505                  break;
 506  506          /*
 507  507           * Return the current state of logging.
 508  508           */
 509  509          case SIOCGETLG :
 510  510                  error = BCOPYOUT((char *)&ifs->ifs_ipstate_logging,
 511  511                                  (char *)data,
 512  512                                  sizeof(ifs->ifs_ipstate_logging));
 513  513                  if (error != 0)
 514  514                          error = EFAULT;
 515  515                  break;
 516  516          /*
 517  517           * Return the number of bytes currently waiting to be read.
 518  518           */
 519  519          case FIONREAD :
 520  520                  arg = ifs->ifs_iplused[IPL_LOGSTATE]; /* returned in an int */
 521  521                  error = BCOPYOUT((char *)&arg, data, sizeof(arg));
 522  522                  if (error != 0)
 523  523                          error = EFAULT;
 524  524                  break;
 525  525  #endif
 526  526          /*
 527  527           * Get the current state statistics.
 528  528           */
 529  529          case SIOCGETFS :
 530  530                  error = fr_outobj(data, fr_statetstats(ifs), IPFOBJ_STATESTAT);
 531  531                  break;
 532  532          /*
 533  533           * Lock/Unlock the state table.  (Locking prevents any changes, which
 534  534           * means no packets match).
 535  535           */
 536  536          case SIOCSTLCK :
 537  537                  if (!(mode & FWRITE)) {
 538  538                          error = EPERM;
 539  539                  } else {
 540  540                          error = fr_lock(data, &ifs->ifs_fr_state_lock);
 541  541                  }
 542  542                  break;
 543  543          /*
 544  544           * Add an entry to the current state table.
 545  545           */
 546  546          case SIOCSTPUT :
 547  547                  if (!ifs->ifs_fr_state_lock || !(mode & FWRITE)) {
 548  548                          error = EACCES;
 549  549                          break;
 550  550                  }
 551  551                  error = fr_stputent(data, ifs);
 552  552                  break;
 553  553          /*
 554  554           * Get a state table entry.
 555  555           */
 556  556          case SIOCSTGET :
 557  557                  if (!ifs->ifs_fr_state_lock) {
 558  558                          error = EACCES;
 559  559                          break;
 560  560                  }
 561  561                  error = fr_stgetent(data, ifs);
 562  562                  break;
 563  563  
 564  564          case SIOCGENITER :
 565  565              {
 566  566                  ipftoken_t *token;
 567  567                  ipfgeniter_t iter;
 568  568  
 569  569                  error = fr_inobj(data, &iter, IPFOBJ_GENITER);
 570  570                  if (error != 0)
 571  571                          break;
 572  572  
 573  573                  token = ipf_findtoken(IPFGENITER_STATE, uid, ctx, ifs);
 574  574                  if (token != NULL)
 575  575                          error = fr_stateiter(token, &iter, ifs);
 576  576                  else
 577  577                          error = ESRCH;
 578  578                  RWLOCK_EXIT(&ifs->ifs_ipf_tokens);
 579  579                  break;
 580  580              }
 581  581  
 582  582          case SIOCIPFDELTOK :
 583  583                  error = BCOPYIN(data, (char *)&arg, sizeof(arg));
 584  584                  if (error != 0) {
 585  585                          error = EFAULT;
 586  586                  } else {
 587  587                          error = ipf_deltoken(arg, uid, ctx, ifs);
 588  588                  }
 589  589                  break;
 590  590  
 591  591          default :
 592  592                  error = EINVAL;
 593  593                  break;
 594  594          }
 595  595          return error;
 596  596  }
 597  597  
 598  598  
 599  599  /* ------------------------------------------------------------------------ */
 600  600  /* Function:    fr_stgetent                                                 */
 601  601  /* Returns:     int - 0 == success, != 0 == failure                         */
 602  602  /* Parameters:  data(I) - pointer to state structure to retrieve from table */
 603  603  /*                                                                          */
 604  604  /* Copy out state information from the kernel to a user space process.  If  */
 605  605  /* there is a filter rule associated with the state entry, copy that out    */
 606  606  /* as well.  The entry to copy out is taken from the value of "ips_next" in */
 607  607  /* the struct passed in and if not null and not found in the list of current*/
 608  608  /* state entries, the retrieval fails.                                      */
 609  609  /* ------------------------------------------------------------------------ */
 610  610  int fr_stgetent(data, ifs)
 611  611  caddr_t data;
 612  612  ipf_stack_t *ifs;
 613  613  {
 614  614          ipstate_t *is, *isn;
 615  615          ipstate_save_t ips;
 616  616          int error;
 617  617  
                   /*
                    * Load the caller's request into ips.  Any failure reported by
                    * fr_inobj() is returned as EFAULT.
                    *
                    * NOTE(review): no lock is taken in this function while
                    * ifs_ips_list is examined; presumably the caller serialises
                    * access to the state table -- confirm.
                    */
 618  618          error = fr_inobj(data, &ips, IPFOBJ_STATESAVE);
 619  619          if (error)
 620  620                  return EFAULT;
 621  621  
                   /* A NULL ips_next selects the current head of the state list. */
 622  622          isn = ips.ips_next;
 623  623          if (isn == NULL) {
 624  624                  isn = ifs->ifs_ips_list;
 625  625                  if (isn == NULL) {
                                   /*
                                    * NOTE(review): ips.ips_next is always NULL on this
                                    * path (isn was loaded from it and tested above), so
                                    * an empty list always yields ENOENT and the
                                    * "return 0" below cannot be reached.
                                    */
 626  626                          if (ips.ips_next == NULL)
 627  627                                  return ENOENT;
 628  628                          return 0;
 629  629                  }
 630  630          } else {
 631  631                  /*
 632  632                   * Make sure the pointer we're copying from exists in the
 633  633                   * current list of entries.  Security precaution to prevent
 634  634                   * copying of random kernel data.
 635  635                   */
 636  636                  for (is = ifs->ifs_ips_list; is; is = is->is_next)
 637  637                          if (is == isn)
 638  638                                  break;
 639  639                  if (!is)
 640  640                          return ESRCH;
 641  641          }
                   /*
                    * Report the successor of the chosen entry in ips_next, then take
                    * a byte copy of the entry itself and, when it has one, of its
                    * rule.  When the entry has no rule, ips_fr is left holding
                    * whatever fr_inobj() loaded into it.
                    */
 642  642          ips.ips_next = isn->is_next;
 643  643          bcopy((char *)isn, (char *)&ips.ips_is, sizeof(ips.ips_is));
 644  644          ips.ips_rule = isn->is_rule;
 645  645          if (isn->is_rule != NULL)
 646  646                  bcopy((char *)isn->is_rule, (char *)&ips.ips_fr,
 647  647                        sizeof(ips.ips_fr));
                   /* Hand the filled-in structure back through data. */
 648  648          error = fr_outobj(data, &ips, IPFOBJ_STATESAVE);
 649  649          if (error)
 650  650                  return EFAULT;
 651  651          return 0;
 652  652  }
 653  653  
 654  654  
 655  655  /* ------------------------------------------------------------------------ */
 656  656  /* Function:    fr_stputent                                                 */
 657  657  /* Returns:     int - 0 == success, != 0 == failure                         */
 658  658  /* Parameters:  data(I) - pointer to state information struct               */
 659  659  /*              ifs     - ipf stack instance                                */
 660  660  /*                                                                          */
 661  661  /* This function implements the SIOCSTPUT ioctl: insert a state entry into  */
 662  662  /* the state table.  If the state info. includes a pointer to a filter rule */
 663  663  /* then also add in an orphaned rule (will not show up in any "ipfstat -io" */
 664  664  /* output).                                                                 */
 665  665  /* ------------------------------------------------------------------------ */
 666  666  int fr_stputent(data, ifs)
 667  667  caddr_t data;
 668  668  ipf_stack_t *ifs;
 669  669  {
 670  670          ipstate_t *is, *isn;
 671  671          ipstate_save_t ips;
 672  672          int error, i;
 673  673          frentry_t *fr;
 674  674          char *name;
 675  675  
                   /* Load the caller's ipstate_save_t; failures are reported as EFAULT. */
 676  676          error = fr_inobj(data, &ips, IPFOBJ_STATESAVE);
 677  677          if (error)
 678  678                  return EFAULT;
 679  679  
 680  680          /*
 681  681           * Trigger automatic call to fr_state_flush() if the
 682  682           * table has reached capacity specified by hi watermark.
 683  683           */
 684  684          if (ST_TAB_WATER_LEVEL(ifs) > ifs->ifs_state_flush_level_hi)
 685  685                  ifs->ifs_fr_state_doflush = 1;
 686  686  
 687  687          /*
 688  688           * If automatic flushing did not do its job, and the table
 689  689           * has filled up, don't try to create a new entry.
 690  690           */
 691  691          if (ifs->ifs_ips_num >= ifs->ifs_fr_statemax) {
 692  692                  ATOMIC_INCL(ifs->ifs_ips_stats.iss_max);
 693  693                  return ENOMEM;
 694  694          }
 695  695  
 696  696          KMALLOC(isn, ipstate_t *);
 697  697          if (isn == NULL)
 698  698                  return ENOMEM;
 699  699  
                   /*
                    * Seed the new entry from the caller's copy, then zero every byte
                    * that precedes is_pkts and explicitly reset the is_sti linkage,
                    * the interface pointers and is_sync, so that none of those values
                    * supplied by the caller are used.
                    */
 700  700          bcopy((char *)&ips.ips_is, (char *)isn, sizeof(*isn));
 701  701          bzero((char *)isn, offsetof(struct ipstate, is_pkts));
 702  702          isn->is_sti.tqe_pnext = NULL;
 703  703          isn->is_sti.tqe_next = NULL;
 704  704          isn->is_sti.tqe_ifq = NULL;
 705  705          isn->is_sti.tqe_parent = isn;
 706  706          isn->is_ifp[0] = NULL;
 707  707          isn->is_ifp[1] = NULL;
 708  708          isn->is_ifp[2] = NULL;
 709  709          isn->is_ifp[3] = NULL;
 710  710          isn->is_sync = NULL;
 711  711          fr = ips.ips_rule;
 712  712  
                   /*
                    * No rule supplied: insert the entry as it stands.  fr_stinsert()
                    * returns with is_lock held (see its header), hence the
                    * MUTEX_EXIT() right after each call in this function.
                    */
 713  713          if (fr == NULL) {
 714  714                  READ_ENTER(&ifs->ifs_ipf_state);
 715  715                  fr_stinsert(isn, 0, ifs);
 716  716                  MUTEX_EXIT(&isn->is_lock);
 717  717                  RWLOCK_EXIT(&ifs->ifs_ipf_state);
 718  718                  return 0;
 719  719          }
 720  720  
 721  721          if (isn->is_flags & SI_NEWFR) {
                           /*
                            * SI_NEWFR set: allocate a fresh rule and fill it from the
                            * ips_fr copy supplied by the caller.  The new kernel address
                            * is also written into ips so that it is part of what gets
                            * sent back below.
                            */
 722  722                  KMALLOC(fr, frentry_t *);
 723  723                  if (fr == NULL) {
 724  724                          KFREE(isn);
 725  725                          return ENOMEM;
 726  726                  }
 727  727                  bcopy((char *)&ips.ips_fr, (char *)fr, sizeof(*fr));
 728  728                  isn->is_rule = fr;
 729  729                  ips.ips_is.is_rule = fr;
                           /* The lock bytes came from the caller's copy; set the lock up afresh. */
 730  730                  MUTEX_NUKE(&fr->fr_lock);
 731  731                  MUTEX_INIT(&fr->fr_lock, "state filter rule lock");
 732  732  
 733  733                  /*
 734  734                   * Look up all the interface names in the rule.
 735  735                   */
 736  736                  for (i = 0; i < 4; i++) {
 737  737                          name = fr->fr_ifnames[i];
 738  738                          fr->fr_ifas[i] = fr_resolvenic(name, fr->fr_v, ifs);
 739  739                          name = isn->is_ifname[i];
 740  740                          isn->is_ifp[i] = fr_resolvenic(name, isn->is_v, ifs);
 741  741                  }
 742  742  
                           /*
                            * Reference count starts at zero (fr_stinsert() increments
                            * fr_ref) and no rule data is attached.
                            */
 743  743                  fr->fr_ref = 0;
 744  744                  fr->fr_dsize = 0;
 745  745                  fr->fr_data = NULL;
 746  746                  fr->fr_type = FR_T_NONE;
 747  747  
 748  748                  fr_resolvedest(&fr->fr_tif, fr->fr_v, ifs);
 749  749                  fr_resolvedest(&fr->fr_dif, fr->fr_v, ifs);
 750  750                  fr_resolvedest(&fr->fr_rif, fr->fr_v, ifs);
 751  751  
 752  752                  /*
 753  753                   * send a copy back to userland of what we ended up
 754  754                   * to allow for verification.
 755  755                   */
 756  756                  error = fr_outobj(data, &ips, IPFOBJ_STATESAVE);
 757  757                  if (error) {
                                   /* Nothing has been inserted yet, so both allocations are released. */
 758  758                          KFREE(isn);
 759  759                          MUTEX_DESTROY(&fr->fr_lock);
 760  760                          KFREE(fr);
 761  761                          return EFAULT;
 762  762                  }
 763  763                  READ_ENTER(&ifs->ifs_ipf_state);
 764  764                  fr_stinsert(isn, 0, ifs);
 765  765                  MUTEX_EXIT(&isn->is_lock);
 766  766                  RWLOCK_EXIT(&ifs->ifs_ipf_state);
 767  767  
 768  768          } else {
                           /*
                            * Rule pointer supplied without SI_NEWFR: the entry is only
                            * inserted if some state entry already on the list refers to
                            * that same rule; otherwise it is freed and ESRCH is returned.
                            *
                            * NOTE(review): isn->is_rule is not assigned on this path.  If
                            * is_rule lies before is_pkts in struct ipstate it was cleared
                            * by the bzero() above and the entry goes in without a rule --
                            * confirm against the structure layout.
                            */
 769  769                  READ_ENTER(&ifs->ifs_ipf_state);
 770  770                  for (is = ifs->ifs_ips_list; is; is = is->is_next)
 771  771                          if (is->is_rule == fr) {
 772  772                                  fr_stinsert(isn, 0, ifs);
 773  773                                  MUTEX_EXIT(&isn->is_lock);
 774  774                                  break;
 775  775                          }
 776  776  
 777  777                  if (is == NULL) {
 778  778                          KFREE(isn);
 779  779                          isn = NULL;
 780  780                  }
 781  781                  RWLOCK_EXIT(&ifs->ifs_ipf_state);
 782  782  
 783  783                  return (isn == NULL) ? ESRCH : 0;
 784  784          }
 785  785  
 786  786          return 0;
 787  787  }
 788  788  
 789  789  
 790  790  /* ------------------------------------------------------------------------ */
 791  791  /* Function:   fr_stinsert                                                  */
 792  792  /* Returns:    Nil                                                          */
 793  793  /* Parameters: is(I)  - pointer to state structure                          */
 794  794  /*             rev(I) - flag indicating forward/reverse direction of packet */
           /*             ifs    - ipf stack instance                                  */
 795  795  /*                                                                          */
 796  796  /* Inserts a state structure into the hash table (for lookups) and the list */
 797  797  /* of state entries (for enumeration).  Resolves all of the interface names */
 798  798  /* to pointers and adjusts running stats for the hash table as appropriate. */
 799  799  /*                                                                          */
 800  800  /* Locking: it is assumed that some kind of lock on ipf_state is held.      */
 801  801  /*          Exits with is_lock initialised and held.                        */
 802  802  /* ------------------------------------------------------------------------ */
 803  803  void fr_stinsert(is, rev, ifs)
 804  804  ipstate_t *is;
 805  805  int rev;
 806  806  ipf_stack_t *ifs;
 807  807  {
 808  808          frentry_t *fr;
 809  809          u_int hv;
 810  810          int i;
 811  811  
 812  812          MUTEX_INIT(&is->is_lock, "ipf state entry");
 813  813  
                   /*
                    * If the entry has a rule, take a reference on it and count this
                    * entry in the rule's state total, both under the rule's lock.
                    */
 814  814          fr = is->is_rule;
 815  815          if (fr != NULL) {
 816  816                  MUTEX_ENTER(&fr->fr_lock);
 817  817                  fr->fr_ref++;
 818  818                  fr->fr_statecnt++;
 819  819                  MUTEX_EXIT(&fr->fr_lock);
 820  820          }
 821  821  
 822  822          /*
 823  823           * Look up all the interface names in the state entry.
                    * Slots that already hold a pointer are left alone.
 824  824           */
 825  825          for (i = 0; i < 4; i++) {
 826  826                  if (is->is_ifp[i] != NULL)
 827  827                          continue;
 828  828                  is->is_ifp[i] = fr_resolvenic(is->is_ifname[i], is->is_v, ifs);
 829  829          }
 830  830  
 831  831          /*
 832  832           * If we could trust is_hv, then the modulus would not be needed, but
 833  833           * when running with IPFILTER_SYNC, this stops bad values.
 834  834           */
 835  835          hv = is->is_hv % ifs->ifs_fr_statesize;
 836  836          is->is_hv = hv;
 837  837  
 838  838          /*
 839  839           * We need to get both of these locks...the first because it is
 840  840           * possible that once the insert is complete another packet might
 841  841           * come along, match the entry and want to update it.
 842  842           */
 843  843          MUTEX_ENTER(&is->is_lock);
 844  844          MUTEX_ENTER(&ifs->ifs_ipf_stinsert);
 845  845  
 846  846          /*
 847  847           * add into list table.
                    * The entry becomes the new head of ifs_ips_list; the old head's
                    * back pointer is redirected to the new entry's is_next.
 848  848           */
 849  849          if (ifs->ifs_ips_list != NULL)
 850  850                  ifs->ifs_ips_list->is_pnext = &is->is_next;
 851  851          is->is_pnext = &ifs->ifs_ips_list;
 852  852          is->is_next = ifs->ifs_ips_list;
 853  853          ifs->ifs_ips_list = is;
 854  854  
                   /*
                    * Same head insertion for hash bucket hv.  A bucket that was empty
                    * until now is counted in iss_inuse.
                    */
 855  855          if (ifs->ifs_ips_table[hv] != NULL)
 856  856                  ifs->ifs_ips_table[hv]->is_phnext = &is->is_hnext;
 857  857          else
 858  858                  ifs->ifs_ips_stats.iss_inuse++;
 859  859          is->is_phnext = ifs->ifs_ips_table + hv;
 860  860          is->is_hnext = ifs->ifs_ips_table[hv];
 861  861          ifs->ifs_ips_table[hv] = is;
 862  862          ifs->ifs_ips_stats.iss_bucketlen[hv]++;
 863  863          ifs->ifs_ips_num++;
 864  864          MUTEX_EXIT(&ifs->ifs_ipf_stinsert);
 865  865  
                   /* is_lock is deliberately still held here; the caller releases it. */
 866  866          fr_setstatequeue(is, rev, ifs);
 867  867  }
 868  868  
 869  869  /* ------------------------------------------------------------------------ */
 870  870  /* Function:    fr_match_ipv4addrs                                          */
 871  871  /* Returns:     int -   2 strong match (same addresses, same direction)     */
 872  872  /*                      1 weak match (same address, opposite direction)     */
 873  873  /*                      0 no match                                          */
 874  874  /*                                                                          */
 875  875  /* Function matches IPv4 addresses.                                         */
 876  876  /* ------------------------------------------------------------------------ */
 877  877  static int fr_match_ipv4addrs(is1, is2)
 878  878  ipstate_t *is1;
 879  879  ipstate_t *is2;
 880  880  {
 881  881          /* Same source and same destination: entries point the same way. */
 882  882          if (is1->is_saddr == is2->is_saddr &&
 883  883              is1->is_daddr == is2->is_daddr)
 884  884                  return (2);
 885  885  
 886  886          /* Source and destination swapped: same hosts, opposite direction. */
 887  887          if (is1->is_saddr == is2->is_daddr &&
 888  888              is1->is_daddr == is2->is_saddr)
 889  889                  return (1);
 890  890  
 891  891          return (0);
 892  892  }
 893  893  
 894  894  /* ------------------------------------------------------------------------ */
 895  895  /* Function:    fr_match_ipv6addrs                                          */
 896  896  /* Returns:     int -   2 strong match (same addresses, same direction)     */
 897  897  /*                      1 weak match (same addresses, opposite direction)   */
 898  898  /*                      0 no match                                          */
 899  899  /*                                                                          */
 900  900  /* Function matches IPv6 addresses.                                         */
 901  901  /* ------------------------------------------------------------------------ */
 902  902  static int fr_match_ipv6addrs(is1, is2)
 903  903  ipstate_t *is1;
 904  904  ipstate_t *is2;
 905  905  {
 906  906          /*
 907  907           * Strong match: source equals source and destination equals
 908  908           * destination.
 909  909           */
 910  910          if (IP6_EQ(&is1->is_src, &is2->is_src) &&
 911  911              IP6_EQ(&is1->is_dst, &is2->is_dst))
 912  912                  return (2);
 913  913  
 914  914          /* Weak match: the two entries have source and destination swapped. */
 915  915          if (IP6_EQ(&is1->is_src, &is2->is_dst) &&
 916  916              IP6_EQ(&is1->is_dst, &is2->is_src))
 917  917                  return (1);
 918  918          return (0);
 919  919  }
 920  920  /* ------------------------------------------------------------------------ */
 921  921  /* Function:    fr_match_addresses                                          */
 922  922  /* Returns:     int -   2 strong match (same addresses, same direction)     */
 923  923  /*                      1 weak match (same address, opposite directions)    */
 924  924  /*                      0 no match                                          */
 925  925  /* Parameters:  is1, is2 pointers to states we are checking                 */
 926  926  /*                                                                          */
 927  927  /* Matches addresses, function uses fr_match_ipvXaddrs() to deal with IPv4  */
 928  928  /* and IPv6 address format.                                                 */
 929  929  /* ------------------------------------------------------------------------ */
 930  930  static int fr_match_addresses(is1, is2)
 931  931  ipstate_t *is1;
 932  932  ipstate_t *is2;
 933  933  {
 934  934          /*
 935  935           * The address family is taken from is1 alone: version 4 uses the
 936  936           * IPv4 comparison, every other value falls through to the IPv6
 937  937           * one.  is2's version is not consulted here.
 938  938           */
 939  939          if (is1->is_v == 4)
 940  940                  return (fr_match_ipv4addrs(is1, is2));
 941  941  
 942  942          return (fr_match_ipv6addrs(is1, is2));
 943  943  }
 944  944  
 945  945  /* ------------------------------------------------------------------------ */
 946  946  /* Function:    fr_match_ppairs                                             */
 947  947  /* Returns:     int -   2 strong match (same ports, same direction)         */
 948  948  /*                      1 weak match (same ports, different direction)      */
 949  949  /*                      0 no match                                          */
 950  950  /* Parameters   ppairs1, ppairs - src, dst ports we want to match.          */
 951  951  /*                                                                          */
 952  952  /* Matches two port_pair_t types (port pairs). Each port pair contains      */
 953  953  /* src, dst port, which belong to session (state entry).                    */
 954  954  /* ------------------------------------------------------------------------ */
 955  955  static int fr_match_ppairs(ppairs1, ppairs2)
 956  956  port_pair_t *ppairs1;
 957  957  port_pair_t *ppairs2;
 958  958  {
 959  959          /* Identical source and destination values: same direction. */
 960  960          if (ppairs1->pp_sport == ppairs2->pp_sport &&
 961  961              ppairs1->pp_dport == ppairs2->pp_dport)
 962  962                  return (2);
 963  963  
 964  964          /* Values cross over (src <-> dst): opposite direction. */
 965  965          if (ppairs1->pp_sport == ppairs2->pp_dport &&
 966  966              ppairs1->pp_dport == ppairs2->pp_sport)
 967  967                  return (1);
 968  968  
 969  969          /* Anything else is not a match. */
 970  970          return (0);
 971  971  }
 972  972  
 973  973  /* ------------------------------------------------------------------------ */
 974  974  /* Function:    fr_match_l4_hdr                                             */
 975  975  /* Returns:     int -   0 no match,                                         */
 976  976  /*                      1 weak match (same ports, different directions)     */
 977  977  /*                      2 strong match (same ports, same direction)         */
 978  978  /* Parameters   is1, is2 - states we want to match                          */
 979  979  /*                                                                          */
 980  980  /* Matches L4 data: ports for TCP/UDP, call ids for GRE.  ICMP/ICMPv6 have  */
 981  981  /* no direction: identical ICMP info is a strong match, else no match.      */
 982  982  /* ------------------------------------------------------------------------ */
 983  983  static int fr_match_l4_hdr(is1, is2)
 984  984  ipstate_t *is1;
 985  985  ipstate_t *is2;
 986  986  {
 987  987          int     rv = 0;
 988  988          port_pair_t     pp1;
 989  989          port_pair_t     pp2;
 990  990          /* Entries for different protocols never belong to one session. */
 991  991          if (is1->is_p != is2->is_p)
 992  992                  return (0);
 993  993  
 994  994          switch (is1->is_p) {
 995  995                  case    IPPROTO_TCP:
 996  996                          pp1.pp_sport = is1->is_ps.is_ts.ts_sport;
 997  997                          pp1.pp_dport = is1->is_ps.is_ts.ts_dport;
 998  998                          pp2.pp_sport = is2->is_ps.is_ts.ts_sport;
 999  999                          pp2.pp_dport = is2->is_ps.is_ts.ts_dport;
1000 1000                          rv = fr_match_ppairs(&pp1, &pp2);
1001 1001                          break;
1002 1002                  case    IPPROTO_UDP:
1003 1003                          pp1.pp_sport = is1->is_ps.is_us.us_sport;
1004 1004                          pp1.pp_dport = is1->is_ps.is_us.us_dport;
1005 1005                          pp2.pp_sport = is2->is_ps.is_us.us_sport;
1006 1006                          pp2.pp_dport = is2->is_ps.is_us.us_dport;
1007 1007                          rv = fr_match_ppairs(&pp1, &pp2);
1008 1008                          break;
1009 1009                  case    IPPROTO_GRE:
1010 1010                          /* greinfo_t can also be interpreted as a port pair */
1011 1011                          pp1.pp_sport = is1->is_ps.is_ug.gs_call[0];
1012 1012                          pp1.pp_dport = is1->is_ps.is_ug.gs_call[1];
1013 1013                          pp2.pp_sport = is2->is_ps.is_ug.gs_call[0];
1014 1014                          pp2.pp_dport = is2->is_ps.is_ug.gs_call[1];
1015 1015                          rv = fr_match_ppairs(&pp1, &pp2);
1016 1016                          break;
1017 1017                  case    IPPROTO_ICMP:
1018 1018                  case    IPPROTO_ICMPV6:
1019 1019                          /* bcmp() returns 0 when the ICMP info is identical */
1020 1020                          if (bcmp(&is1->is_ps, &is2->is_ps,
1021 1021                              sizeof (icmpinfo_t)) == 0)
1022 1022                                  rv = 2;
1023 1023                          break;
1024 1024                  default:
1025 1025                          rv = 0;
1026 1026          }
1027 1027  
1028 1028          return (rv);
1029 1029  }
1030 1030  
1031 1031  /* ------------------------------------------------------------------------ */
1032 1032  /* Function:    fr_matchstates                                              */
1033 1033  /* Returns:     int - nonzero match, zero no match                          */
1034 1034  /* Parameters   is1, is2 - states we want to match                          */
1035 1035  /*                                                                          */
1036 1036  /* The state entries are equal (identical match) if they belong to the same */
1037 1037  /* session. Any time new state entry is being added the fr_addstate()       */
1038 1038  /* function creates temporary state entry from the data it gets from IP and */
1039 1039  /* L4 header. The fr_matchstates() must be also aware of packet direction,  */
1040 1040  /* which is also stored within the state entry. We should keep in mind the  */
1041 1041  /* information about packet direction is spread across L3 (addresses) and   */
1042 1042  /* L4 (ports). There are three possible relationships between is1, is2:     */
1043 1043  /*              - no match (match(is1, is2) == 0))                          */
1044 1044  /*              - weak match same addresses (ports), but different          */
1045 1045  /*                      directions (1)  (fr_match_xxxx(is1, is2) == 1)      */
1046 1046  /*              - strong match same addresses (ports) and same directions   */
1047 1047  /*                       (2) (fr_match_xxxx(is1, is2) == 2)                 */
1048 1048  /*                                                                          */
1049 1049  /* There are functions, which match addresses (L3 header) in is1, is2       */
1050 1050  /* and functions, which are used to compare ports (L4 header) data. We say  */
1051 1051  /* the is1 and is2 are same (identical) if there is a match                 */
1052 1052  /* (fr_match_l4_hdr(is1, is2) != 0) and matchlevels are same for entries    */
1053 1053  /* (fr_match_addresses(is1, is2) == fr_match_l4_hdr(is1, is2)) for is1, is2.*/
1054 1054  /* Such requirement deals with case as follows:                             */
1055 1055  /*      suppose there are two connections between hosts A, B. Connection 1: */
1056 1056  /*                      a.a.a.a:12345 <=> b.b.b.b:54321                     */
1057 1057  /*              Connection 2:                                               */
1058 1058  /*                      a.a.a.a:54321 <=> b.b.b.b:12345                     */
1059 1059  /* since we've introduced match levels into our fr_matchstates(), we are    */
1060 1060  /* able to identify, which packets belong to connection 1 and which belong  */
1061 1061  /* to connection 2.     Assume there are two entries is1, is2. is1 has been */
1062 1062  /* created from a con. 1 packet, which travelled from A to B:               */
1063 1063  /*                      a.a.a.a:12345 -> b.b.b.b:54321                      */
1064 1064  /* while is2 has been created from packet which belongs to con. 2 and is    */
1065 1065  /* also coming from A to B:                                                 */
1066 1066  /*                      a.a.a.a:54321 -> b.b.b.b:12345                      */
1067 1067  /* fr_match_addresses(is1, is2) == 2 -> strong match, while                 */
1068 1068  /* fr_match_l4_hdr(is1, is2) == 1 -> weak match. Since match levels are     */
1069 1069  /* different the state entries are not identical -> no match as a final     */
1070 1070  /* result.                                                                  */
1071 1071  /* ------------------------------------------------------------------------ */
1072 1072  static int fr_matchstates(is1, is2)
1073 1073  ipstate_t *is1;
1074 1074  ipstate_t *is2;
1075 1075  {
1076 1076          int     l3level;
1077 1077          int     l4level;
1078 1078  
1079 1079          /*
1080 1080           * Everything stored from is_pass up to (not including) is_ps must
1081 1081           * be byte-for-byte equal before addresses and ports are examined.
1082 1082           */
1083 1083          if (bcmp(&is1->is_pass, &is2->is_pass,
1084 1084              offsetof(struct ipstate, is_ps) -
1085 1085              offsetof(struct ipstate, is_pass)) != 0)
1086 1086                  return (0);
1087 1087  
1088 1088          l4level = fr_match_l4_hdr(is1, is2);
1089 1089          l3level = fr_match_addresses(is1, is2);
1090 1090  
1091 1091          /*
1092 1092           * The entries describe the same session only when the addresses
1093 1093           * match at all and the L3 and L4 match levels agree.
1094 1094           */
1095 1095          if (l3level == 0)
1096 1096                  return (0);
1097 1097          return (l3level == l4level);
1098 1098  }
1099 1099  
1100 1100  /* ------------------------------------------------------------------------ */
1101 1101  /* Function:    fr_addstate                                                 */
1102 1102  /* Returns:     ipstate_t* - NULL == failure, else pointer to new state     */
1103 1103  /* Parameters:  fin(I)    - pointer to packet information                   */
1104 1104  /*              stsave(O) - pointer to place to save pointer to created     */
1105 1105  /*                          state structure.                                */
1106 1106  /*              flags(I)  - flags to use when creating the structure        */
1107 1107  /*                                                                          */
1108 1108  /* Creates a new IP state structure from the packet information collected.  */
1109 1109  /* Inserts it into the state table and appends to the bottom of the active  */
1110 1110  /* list.  If the capacity of the table has reached the maximum allowed then */
1111 1111  /* the call will fail and a flush is scheduled for the next timeout call.   */
1112 1112  /* ------------------------------------------------------------------------ */
1113 1113  ipstate_t *fr_addstate(fin, stsave, flags)
1114 1114  fr_info_t *fin;
1115 1115  ipstate_t **stsave;
1116 1116  u_int flags;
1117 1117  {
1118 1118          ipstate_t *is, ips;
1119 1119          struct icmp *ic;
1120 1120          u_int pass, hv;
1121 1121          frentry_t *fr;
1122 1122          tcphdr_t *tcp;
1123 1123          grehdr_t *gre;
1124 1124          void *ifp;
1125 1125          int out;
1126 1126          ipf_stack_t *ifs = fin->fin_ifs;
1127 1127  
1128 1128          if (ifs->ifs_fr_state_lock ||
1129 1129              (fin->fin_flx & (FI_SHORT|FI_STATE|FI_FRAGBODY|FI_BAD)))
1130 1130                  return NULL;
1131 1131  
1132 1132          if ((fin->fin_flx & FI_OOW) && !(fin->fin_tcpf & TH_SYN))
1133 1133                  return NULL;
1134 1134  
1135 1135          /*
1136 1136           * Trigger automatic call to fr_state_flush() if the
1137 1137           * table has reached capacity specified by hi watermark.
1138 1138           */
1139 1139          if (ST_TAB_WATER_LEVEL(ifs) > ifs->ifs_state_flush_level_hi)
1140 1140                  ifs->ifs_fr_state_doflush = 1;
1141 1141  
1142 1142          /*
1143 1143           * If the max number of state entries has been reached, and there is no
1144 1144           * limit on the state count for the rule, then do not continue.  In the
1145 1145           * case where a limit exists, it's ok allow the entries to be created as
1146 1146           * long as specified limit itself has not been reached. 
1147 1147           *
1148 1148           * Note that because the lock isn't held on fr, it is possible to exceed
1149 1149           * the specified size of the table.  However, the cost of this is being
1150 1150           * ignored here; as the number by which it can go over is a product of
1151 1151           * the number of simultaneous threads that could be executing in here.
1152 1152           * So, a limit of 100 won't result in 200, but could result in 101 or 102.
1153 1153           *
1154 1154           * Also note that, since the automatic flush should have been triggered
1155 1155           * well before we reach the maximum number of state table entries, the
1156 1156           * likelihood of reaching the max (and thus exceeding it) is minimal.
1157 1157           */ 
1158 1158          fr = fin->fin_fr;
1159 1159          if (fr != NULL) {
1160 1160                  if ((ifs->ifs_ips_num >= ifs->ifs_fr_statemax) &&
1161 1161                      (fr->fr_statemax == 0)) {
1162 1162                          ATOMIC_INCL(ifs->ifs_ips_stats.iss_max);
1163 1163                          return NULL;
1164 1164                  }
1165 1165                  if ((fr->fr_statemax != 0) &&
1166 1166                      (fr->fr_statecnt >= fr->fr_statemax)) {
1167 1167                          ATOMIC_INCL(ifs->ifs_ips_stats.iss_maxref);
1168 1168                          ifs->ifs_fr_state_doflush = 1;
1169 1169                          return NULL;
1170 1170                  }
1171 1171          }
1172 1172  
1173 1173          ic = NULL;
1174 1174          tcp = NULL;
1175 1175          out = fin->fin_out;
1176 1176          is = &ips;
1177 1177          bzero((char *)is, sizeof(*is));
1178 1178  
1179 1179          if (fr == NULL) {
1180 1180                  pass = ifs->ifs_fr_flags;
1181 1181                  is->is_tag = FR_NOLOGTAG;
1182 1182          } else {
1183 1183                  pass = fr->fr_flags;
1184 1184          }
1185 1185  
1186 1186          is->is_die = 1 + ifs->ifs_fr_ticks;
1187 1187          /*
1188 1188           * We want to check everything that is a property of this packet,
1189 1189           * but we don't (automatically) care about its fragment status as
1190 1190           * this may change.
1191 1191           */
1192 1192          is->is_pass = pass;
1193 1193          is->is_v = fin->fin_v;
1194 1194          is->is_opt[0] = fin->fin_optmsk;
1195 1195          is->is_optmsk[0] = 0xffffffff;
1196 1196          /*
1197 1197           * The reverse direction option mask will be set in fr_matchsrcdst(),
1198 1198           * when we will see the first packet from the peer. We will leave it
1199 1199           * as zero for now.
1200 1200           */
1201 1201          is->is_optmsk[1] = 0x0;
1202 1202  
1203 1203          if (is->is_v == 6) {
1204 1204                  is->is_opt[0] &= ~0x8;
1205 1205                  is->is_optmsk[0] &= ~0x8;
1206 1206          }
1207 1207          is->is_sec = fin->fin_secmsk;
1208 1208          is->is_secmsk = 0xffff;
1209 1209          is->is_auth = fin->fin_auth;
1210 1210          is->is_authmsk = 0xffff;
1211 1211  
1212 1212          /*
1213 1213           * Copy and calculate...
1214 1214           */
1215 1215          hv = (is->is_p = fin->fin_fi.fi_p);
1216 1216          is->is_src = fin->fin_fi.fi_src;
1217 1217          hv += is->is_saddr;
1218 1218          is->is_dst = fin->fin_fi.fi_dst;
1219 1219          hv += is->is_daddr;
1220 1220  #ifdef  USE_INET6
1221 1221          if (fin->fin_v == 6) {
1222 1222                  /*
1223 1223                   * For ICMPv6, we check to see if the destination address is
1224 1224                   * a multicast address.  If it is, do not include it in the
1225 1225                   * calculation of the hash because the correct reply will come
1226 1226                   * back from a real address, not a multicast address.
1227 1227                   */
1228 1228                  if ((is->is_p == IPPROTO_ICMPV6) &&
1229 1229                      IN6_IS_ADDR_MULTICAST(&is->is_dst.in6)) {
1230 1230                          /*
1231 1231                           * So you can do keep state with neighbour discovery.
1232 1232                           *
1233 1233                           * Here we could use the address from the neighbour
1234 1234                           * solicit message to put in the state structure and
1235 1235                           * we could use that without a wildcard flag too...
1236 1236                           */
1237 1237                          is->is_flags |= SI_W_DADDR;
1238 1238                          hv -= is->is_daddr;
1239 1239                  } else {
1240 1240                          hv += is->is_dst.i6[1];
1241 1241                          hv += is->is_dst.i6[2];
1242 1242                          hv += is->is_dst.i6[3];
1243 1243                  }
1244 1244                  hv += is->is_src.i6[1];
1245 1245                  hv += is->is_src.i6[2];
1246 1246                  hv += is->is_src.i6[3];
1247 1247          }
1248 1248  #endif
1249 1249          if ((fin->fin_v == 4) &&
1250 1250              (fin->fin_flx & (FI_MULTICAST|FI_BROADCAST|FI_MBCAST))) {
1251 1251                  if (fin->fin_out == 0) {
1252 1252                          flags |= SI_W_DADDR|SI_CLONE;
1253 1253                          hv -= is->is_daddr;
1254 1254                  } else {
1255 1255                          flags |= SI_W_SADDR|SI_CLONE;
1256 1256                          hv -= is->is_saddr;
1257 1257                  }
1258 1258          }
1259 1259  
1260 1260          switch (is->is_p)
1261 1261          {
1262 1262  #ifdef  USE_INET6
1263 1263          case IPPROTO_ICMPV6 :
1264 1264                  ic = fin->fin_dp;
1265 1265  
1266 1266                  switch (ic->icmp_type)
1267 1267                  {
1268 1268                  case ICMP6_ECHO_REQUEST :
1269 1269                          is->is_icmp.ici_type = ic->icmp_type;
1270 1270                          hv += (is->is_icmp.ici_id = ic->icmp_id);
1271 1271                          break;
1272 1272                  case ICMP6_MEMBERSHIP_QUERY :
1273 1273                  case ND_ROUTER_SOLICIT :
1274 1274                  case ND_NEIGHBOR_SOLICIT :
1275 1275                  case ICMP6_NI_QUERY :
1276 1276                          is->is_icmp.ici_type = ic->icmp_type;
1277 1277                          break;
1278 1278                  default :
1279 1279                          return NULL;
1280 1280                  }
1281 1281                  ATOMIC_INCL(ifs->ifs_ips_stats.iss_icmp);
1282 1282                  break;
1283 1283  #endif
1284 1284          case IPPROTO_ICMP :
1285 1285                  ic = fin->fin_dp;
1286 1286  
1287 1287                  switch (ic->icmp_type)
1288 1288                  {
1289 1289                  case ICMP_ECHO :
1290 1290                  case ICMP_ECHOREPLY :
1291 1291                  case ICMP_TSTAMP :
1292 1292                  case ICMP_IREQ :
1293 1293                  case ICMP_MASKREQ :
1294 1294                          is->is_icmp.ici_type = ic->icmp_type;
1295 1295                          hv += (is->is_icmp.ici_id = ic->icmp_id);
1296 1296                          break;
1297 1297                  default :
1298 1298                          return NULL;
1299 1299                  }
1300 1300                  ATOMIC_INCL(ifs->ifs_ips_stats.iss_icmp);
1301 1301                  break;
1302 1302  
1303 1303          case IPPROTO_GRE :
1304 1304                  gre = fin->fin_dp;
1305 1305  
1306 1306                  is->is_gre.gs_flags = gre->gr_flags;
1307 1307                  is->is_gre.gs_ptype = gre->gr_ptype;
1308 1308                  if (GRE_REV(is->is_gre.gs_flags) == 1) {
1309 1309                          is->is_call[0] = fin->fin_data[0];
1310 1310                          is->is_call[1] = fin->fin_data[1];
1311 1311                  }
1312 1312                  break;
1313 1313  
1314 1314          case IPPROTO_TCP :
1315 1315                  tcp = fin->fin_dp;
1316 1316  
1317 1317                  if (tcp->th_flags & TH_RST)
1318 1318                          return NULL;
1319 1319                  /*
1320 1320                   * The endian of the ports doesn't matter, but the ack and
1321 1321                   * sequence numbers do as we do mathematics on them later.
1322 1322                   */
1323 1323                  is->is_sport = htons(fin->fin_data[0]);
1324 1324                  is->is_dport = htons(fin->fin_data[1]);
1325 1325                  if ((flags & (SI_W_DPORT|SI_W_SPORT)) == 0) {
1326 1326                          hv += is->is_sport;
1327 1327                          hv += is->is_dport;
1328 1328                  }
1329 1329  
1330 1330                  /*
1331 1331                   * If this is a real packet then initialise fields in the
1332 1332                   * state information structure from the TCP header information.
1333 1333                   */
1334 1334  
1335 1335                  is->is_maxdwin = 1;
1336 1336                  is->is_maxswin = ntohs(tcp->th_win);
1337 1337                  if (is->is_maxswin == 0)
1338 1338                          is->is_maxswin = 1;
1339 1339  
1340 1340                  if ((fin->fin_flx & FI_IGNORE) == 0) {
1341 1341                          is->is_send = ntohl(tcp->th_seq) + fin->fin_dlen -
1342 1342                                        (TCP_OFF(tcp) << 2) +
1343 1343                                        ((tcp->th_flags & TH_SYN) ? 1 : 0) +
1344 1344                                        ((tcp->th_flags & TH_FIN) ? 1 : 0);
1345 1345                          is->is_maxsend = is->is_send;
1346 1346  
1347 1347                          /*
1348 1348                           * Window scale option is only present in
1349 1349                           * SYN/SYN-ACK packet.
1350 1350                           */
1351 1351                          if ((tcp->th_flags & ~(TH_FIN|TH_ACK|TH_ECNALL)) ==
1352 1352                              TH_SYN &&
1353 1353                              (TCP_OFF(tcp) > (sizeof(tcphdr_t) >> 2))) {
1354 1354                                  if (fr_tcpoptions(fin, tcp,
1355 1355                                          &is->is_tcp.ts_data[0]) == -1) {
1356 1356                                          fin->fin_flx |= FI_BAD;
1357 1357                                  }
1358 1358                          }
1359 1359  
1360 1360                          if ((fin->fin_out != 0) && (pass & FR_NEWISN) != 0) {
1361 1361                                  fr_checknewisn(fin, is);
1362 1362                                  fr_fixoutisn(fin, is);
1363 1363                          }
1364 1364  
1365 1365                          if ((tcp->th_flags & TH_OPENING) == TH_SYN)
1366 1366                                  flags |= IS_TCPFSM;
1367 1367                          else {
1368 1368                                  is->is_maxdwin = is->is_maxswin * 2;
1369 1369                                  is->is_dend = ntohl(tcp->th_ack);
1370 1370                                  is->is_maxdend = ntohl(tcp->th_ack);
1371 1371                                  is->is_maxdwin *= 2;
1372 1372                          }
1373 1373                  }
1374 1374  
1375 1375                  /*
1376 1376                   * If we're creating state for a starting connection, start the
1377 1377                   * timer on it as we'll never see an error if it fails to
1378 1378                   * connect.
1379 1379                   */
1380 1380                  ATOMIC_INCL(ifs->ifs_ips_stats.iss_tcp);
1381 1381                  break;
1382 1382  
1383 1383          case IPPROTO_UDP :
1384 1384                  tcp = fin->fin_dp;
1385 1385  
1386 1386                  is->is_sport = htons(fin->fin_data[0]);
1387 1387                  is->is_dport = htons(fin->fin_data[1]);
1388 1388                  if ((flags & (SI_W_DPORT|SI_W_SPORT)) == 0) {
1389 1389                          hv += tcp->th_dport;
1390 1390                          hv += tcp->th_sport;
1391 1391                  }
1392 1392                  ATOMIC_INCL(ifs->ifs_ips_stats.iss_udp);
1393 1393                  break;
1394 1394  
1395 1395          default :
1396 1396                  break;
1397 1397          }
1398 1398          hv = DOUBLE_HASH(hv, ifs);
1399 1399          is->is_hv = hv;
1400 1400          is->is_rule = fr;
1401 1401          is->is_flags = flags & IS_INHERITED;
1402 1402  
1403 1403          /*
1404 1404           * Look for identical state.
1405 1405           */
1406 1406          for (is = ifs->ifs_ips_table[is->is_hv % ifs->ifs_fr_statesize];
1407 1407               is != NULL;
1408 1408               is = is->is_hnext) {
1409 1409                  if (fr_matchstates(&ips, is) == 1)
1410 1410                          break;
1411 1411          }
1412 1412  
1413 1413          /*
1414 1414           * we've found a matching state -> state already exists,
1415 1415           * we are not going to add a duplicate record.
1416 1416           */
1417 1417          if (is != NULL)
1418 1418                  return NULL;
1419 1419  
1420 1420          if (ifs->ifs_ips_stats.iss_bucketlen[hv] >= ifs->ifs_fr_state_maxbucket) {
1421 1421                  ATOMIC_INCL(ifs->ifs_ips_stats.iss_bucketfull);
1422 1422                  return NULL;
1423 1423          }
1424 1424          KMALLOC(is, ipstate_t *);
1425 1425          if (is == NULL) {
1426 1426                  ATOMIC_INCL(ifs->ifs_ips_stats.iss_nomem);
1427 1427                  return NULL;
1428 1428          }
1429 1429          bcopy((char *)&ips, (char *)is, sizeof(*is));
1430 1430          /*
1431 1431           * Do not do the modulus here, it is done in fr_stinsert().
1432 1432           */
1433 1433          if (fr != NULL) {
1434 1434                  (void) strncpy(is->is_group, fr->fr_group, FR_GROUPLEN);
1435 1435                  if (fr->fr_age[0] != 0) {
1436 1436                          is->is_tqehead[0] = 
1437 1437                              fr_addtimeoutqueue(&ifs->ifs_ips_utqe,
1438 1438                                                 fr->fr_age[0], ifs);
1439 1439                          is->is_sti.tqe_flags |= TQE_RULEBASED;
1440 1440                  }
1441 1441                  if (fr->fr_age[1] != 0) {
1442 1442                          is->is_tqehead[1] = 
1443 1443                              fr_addtimeoutqueue(&ifs->ifs_ips_utqe,
1444 1444                                                 fr->fr_age[1], ifs);
1445 1445                          is->is_sti.tqe_flags |= TQE_RULEBASED;
1446 1446                  }
1447 1447                  is->is_tag = fr->fr_logtag;
1448 1448  
1449 1449                  is->is_ifp[(out << 1) + 1] = fr->fr_ifas[1];
1450 1450                  is->is_ifp[(1 - out) << 1] = fr->fr_ifas[2];
1451 1451                  is->is_ifp[((1 - out) << 1) + 1] = fr->fr_ifas[3];
1452 1452  
1453 1453                  if (((ifp = fr->fr_ifas[1]) != NULL) &&
1454 1454                      (ifp != (void *)-1)) {
1455 1455                          COPYIFNAME(ifp, is->is_ifname[(out << 1) + 1], fr->fr_v);
1456 1456                  }
1457 1457                  if (((ifp = fr->fr_ifas[2]) != NULL) &&
1458 1458                      (ifp != (void *)-1)) {
1459 1459                          COPYIFNAME(ifp, is->is_ifname[(1 - out) << 1], fr->fr_v);
1460 1460                  }
1461 1461                  if (((ifp = fr->fr_ifas[3]) != NULL) &&
1462 1462                      (ifp != (void *)-1)) {
1463 1463                          COPYIFNAME(ifp, is->is_ifname[((1 - out) << 1) + 1], fr->fr_v);
1464 1464                  }
1465 1465          }
1466 1466  
1467 1467          is->is_ifp[out << 1] = fin->fin_ifp;
1468 1468          if (fin->fin_ifp != NULL) {
1469 1469                  COPYIFNAME(fin->fin_ifp, is->is_ifname[out << 1], fin->fin_v);
1470 1470          }
1471 1471  
1472 1472          is->is_ref = 1;
1473 1473          is->is_pkts[0] = 0, is->is_bytes[0] = 0;
1474 1474          is->is_pkts[1] = 0, is->is_bytes[1] = 0;
1475 1475          is->is_pkts[2] = 0, is->is_bytes[2] = 0;
1476 1476          is->is_pkts[3] = 0, is->is_bytes[3] = 0;
1477 1477          if ((fin->fin_flx & FI_IGNORE) == 0) {
1478 1478                  is->is_pkts[out] = 1;
1479 1479                  is->is_bytes[out] = fin->fin_plen;
1480 1480                  is->is_flx[out][0] = fin->fin_flx & FI_CMP;
1481 1481                  is->is_flx[out][0] &= ~FI_OOW;
1482 1482          }
1483 1483  
1484 1484          if (pass & FR_STSTRICT)
1485 1485                  is->is_flags |= IS_STRICT;
1486 1486  
1487 1487          if (pass & FR_STATESYNC)
1488 1488                  is->is_flags |= IS_STATESYNC;
1489 1489  
1490 1490          if (flags & (SI_WILDP|SI_WILDA)) {
1491 1491                  ATOMIC_INCL(ifs->ifs_ips_stats.iss_wild);
1492 1492          }
1493 1493          is->is_rulen = fin->fin_rule;
1494 1494  
1495 1495  
1496 1496          if (pass & FR_LOGFIRST)
1497 1497                  is->is_pass &= ~(FR_LOGFIRST|FR_LOG);
1498 1498  
1499 1499          READ_ENTER(&ifs->ifs_ipf_state);
1500 1500          is->is_me = stsave;
1501 1501  
1502 1502          fr_stinsert(is, fin->fin_rev, ifs);
1503 1503  
1504 1504          if (fin->fin_p == IPPROTO_TCP) {
1505 1505                  /*
1506 1506                  * If we're creating state for a starting connection, start the
1507 1507                  * timer on it as we'll never see an error if it fails to
1508 1508                  * connect.
1509 1509                  */
1510 1510                  (void) fr_tcp_age(&is->is_sti, fin, ifs->ifs_ips_tqtqb,
1511 1511                                    is->is_flags);
1512 1512                  MUTEX_EXIT(&is->is_lock);
1513 1513  #ifdef  IPFILTER_SCAN
1514 1514                  if ((is->is_flags & SI_CLONE) == 0)
1515 1515                          (void) ipsc_attachis(is);
1516 1516  #endif
1517 1517          } else {
1518 1518                  MUTEX_EXIT(&is->is_lock);
1519 1519          }
1520 1520  #ifdef  IPFILTER_SYNC
1521 1521          if ((is->is_flags & IS_STATESYNC) && ((is->is_flags & SI_CLONE) == 0))
1522 1522                  is->is_sync = ipfsync_new(SMC_STATE, fin, is);
1523 1523  #endif
1524 1524          if (ifs->ifs_ipstate_logging)
1525 1525                  ipstate_log(is, ISL_NEW, ifs);
1526 1526  
1527 1527          RWLOCK_EXIT(&ifs->ifs_ipf_state);
1528 1528          fin->fin_rev = IP6_NEQ(&is->is_dst, &fin->fin_daddr);
1529 1529          fin->fin_flx |= FI_STATE;
1530 1530          if (fin->fin_flx & FI_FRAG)
1531 1531                  (void) fr_newfrag(fin, pass ^ FR_KEEPSTATE);
1532 1532  
1533 1533          return is;
1534 1534  }
1535 1535  
1536 1536  
1537 1537  /* ------------------------------------------------------------------------ */
1538 1538  /* Function:    fr_tcpoptions                                               */
1539 1539  /* Returns:     int - 1 == options ok, 0 == truncated, -1 == bad option len */
1540 1540  /* Parameters:  fin(I) - pointer to packet information                      */
1541 1541  /*              tcp(I) - pointer to TCP packet header                       */
1542 1542  /*              td(I)  - pointer to TCP data held as part of the state      */
1543 1543  /*                                                                          */
1544 1544  /* Look after the TCP header for any options and deal with those that are   */
1545 1545  /* present.  Record details about those that we recognise.                  */
1546 1546  /* ------------------------------------------------------------------------ */
1547 1547  static int fr_tcpoptions(fin, tcp, td)
1548 1548  fr_info_t *fin;
1549 1549  tcphdr_t *tcp;
1550 1550  tcpdata_t *td;
1551 1551  {
1552 1552          int off, mlen, ol, i, len, retval;
1553 1553          char buf[64], *s, opt;
1554 1554          mb_t *m = NULL;
1555 1555  
1556 1556          len = (TCP_OFF(tcp) << 2);
1557 1557          if (fin->fin_dlen < len)
1558 1558                  return 0;
1559 1559          len -= sizeof(*tcp);
1560 1560  
1561 1561          off = fin->fin_plen - fin->fin_dlen + sizeof(*tcp) + fin->fin_ipoff;
1562 1562  
1563 1563          m = fin->fin_m;
1564 1564          mlen = MSGDSIZE(m) - off;
1565 1565          if (len > mlen) {
1566 1566                  len = mlen;
1567 1567                  retval = 0;
1568 1568          } else {
1569 1569                  retval = 1;
1570 1570          }
1571 1571  
1572 1572          COPYDATA(m, off, len, buf);
1573 1573  
1574 1574          for (s = buf; len > 0; ) {
1575 1575                  opt = *s;
1576 1576                  if (opt == TCPOPT_EOL)
1577 1577                          break;
1578 1578                  else if (opt == TCPOPT_NOP)
1579 1579                          ol = 1;
1580 1580                  else {
1581 1581                          if (len < 2)
1582 1582                                  break;
1583 1583                          ol = (int)*(s + 1);
1584 1584                          if (ol < 2 || ol > len)
1585 1585                                  break;
1586 1586  
1587 1587                          /*
1588 1588                           * Extract the TCP options we are interested in out of
1589 1589                           * the header and store them in the tcpdata struct.
1590 1590                           */
1591 1591                          switch (opt)
1592 1592                          {
1593 1593                          case TCPOPT_WINDOW :
1594 1594                                  if (ol == TCPOLEN_WINDOW) {
1595 1595                                          i = (int)*(s + 2);
1596 1596                                          if (i > TCP_WSCALE_MAX)
1597 1597                                                  i = TCP_WSCALE_MAX;
1598 1598                                          else if (i < 0)
1599 1599                                                  i = 0;
1600 1600                                          td->td_winscale = i;
1601 1601                                          td->td_winflags |= TCP_WSCALE_SEEN |
1602 1602                                                              TCP_WSCALE_FIRST;
1603 1603                                  } else
1604 1604                                          retval = -1;
1605 1605                                  break;
1606 1606                          case TCPOPT_MAXSEG :
1607 1607                                  /*
1608 1608                                   * So, if we wanted to set the TCP MAXSEG,
1609 1609                                   * it should be done here...
1610 1610                                   */
1611 1611                                  if (ol == TCPOLEN_MAXSEG) {
1612 1612                                          i = (int)*(s + 2);
1613 1613                                          i <<= 8;
1614 1614                                          i += (int)*(s + 3);
1615 1615                                          td->td_maxseg = i;
1616 1616                                  } else
1617 1617                                          retval = -1;
1618 1618                                  break;
1619 1619                          case TCPOPT_SACK_PERMITTED :
1620 1620                                  if (ol == TCPOLEN_SACK_PERMITTED)
1621 1621                                          td->td_winflags |= TCP_SACK_PERMIT;
1622 1622                                  else
1623 1623                                          retval = -1;
1624 1624                                  break;
1625 1625                          }
1626 1626                  }
1627 1627                  len -= ol;
1628 1628                  s += ol;
1629 1629          }
1630 1630          return retval;
1631 1631  }
1632 1632  
1633 1633  
1634 1634  /* ------------------------------------------------------------------------ */
1635 1635  /* Function:    fr_tcpstate                                                 */
1636 1636  /* Returns:     int - 1 == packet matches state entry, 0 == it does not     */
1637 1637  /* Parameters:  fin(I)   - pointer to packet information                    */
1638 1638  /*              tcp(I)   - pointer to TCP packet header                     */
1639 1639  /*              is(I)  - pointer to master state structure                  */
1640 1640  /*                                                                          */
1641 1641  /* Check to see if a packet with TCP headers fits within the TCP window.    */
1642 1642  /* Change timeout depending on whether new packet is a SYN-ACK returning    */
1643 1643  /* for a SYN or a RST or FIN which indicate time to close up shop.          */
1644 1644  /* ------------------------------------------------------------------------ */
1645 1645  static int fr_tcpstate(fin, tcp, is)
1646 1646  fr_info_t *fin;
1647 1647  tcphdr_t *tcp;
1648 1648  ipstate_t *is;
1649 1649  {
1650 1650          int source, ret = 0, flags;
1651 1651          tcpdata_t  *fdata, *tdata;
1652 1652          ipf_stack_t *ifs = fin->fin_ifs;
1653 1653  
1654 1654          source = !fin->fin_rev;
1655 1655          if (((is->is_flags & IS_TCPFSM) != 0) && (source == 1) && 
1656 1656              (ntohs(is->is_sport) != fin->fin_data[0]))
1657 1657                  source = 0;
1658 1658          fdata = &is->is_tcp.ts_data[!source];
1659 1659          tdata = &is->is_tcp.ts_data[source];
1660 1660  
1661 1661          MUTEX_ENTER(&is->is_lock);
1662 1662  
1663 1663          /*
1664 1664           * If a SYN packet is received for a connection that is in a half
1665 1665           * closed state, then move its state entry to deletetq. In such case
1666 1666           * the SYN packet will be consequently dropped. This allows new state
1667 1667           * entry to be created with a retransmitted SYN packet.
1668 1668           */
1669 1669          if ((tcp->th_flags & TH_OPENING) == TH_SYN) {
1670 1670                  if ((is->is_state[source] > IPF_TCPS_ESTABLISHED) &&
1671 1671                      (is->is_state[!source] > IPF_TCPS_ESTABLISHED)) {
1672 1672                          is->is_state[source] = IPF_TCPS_CLOSED;
1673 1673                          is->is_state[!source] = IPF_TCPS_CLOSED;
1674 1674                          /*
1675 1675                           * Do not update is->is_sti.tqe_die in case state entry
1676 1676                           * is already present in deletetq. It prevents state
1677 1677                           * entry ttl update by retransmitted SYN packets, which
1678 1678                           * may arrive before timer tick kicks off. The SYN
1679 1679                           * packet will be dropped again.
1680 1680                           */
1681 1681                          if (is->is_sti.tqe_ifq != &ifs->ifs_ips_deletetq)
1682 1682                                  fr_movequeue(&is->is_sti, is->is_sti.tqe_ifq,
1683 1683                                          &fin->fin_ifs->ifs_ips_deletetq,
1684 1684                                          fin->fin_ifs);
1685 1685  
1686 1686                          MUTEX_EXIT(&is->is_lock);
1687 1687                          return 0;
1688 1688                  }
1689 1689          }
1690 1690  
1691 1691          if (fr_tcpinwindow(fin, fdata, tdata, tcp, is->is_flags)) {
1692 1692  #ifdef  IPFILTER_SCAN
1693 1693                  if (is->is_flags & (IS_SC_CLIENT|IS_SC_SERVER)) {
1694 1694                          ipsc_packet(fin, is);
1695 1695                          if (FR_ISBLOCK(is->is_pass)) {
1696 1696                                  MUTEX_EXIT(&is->is_lock);
1697 1697                                  return 1;
1698 1698                          }
1699 1699                  }
1700 1700  #endif
1701 1701  
1702 1702                  /*
1703 1703                   * Nearing end of connection, start timeout.
1704 1704                   */
1705 1705                  ret = fr_tcp_age(&is->is_sti, fin, ifs->ifs_ips_tqtqb,
1706 1706                                   is->is_flags);
1707 1707                  if (ret == 0) {
1708 1708                          MUTEX_EXIT(&is->is_lock);
1709 1709                          return 0;
1710 1710                  }
1711 1711  
1712 1712                  /*
1713 1713                   * set s0's as appropriate.  Use syn-ack packet as it
1714 1714                   * contains both pieces of required information.
1715 1715                   */
1716 1716                  /*
1717 1717                   * Window scale option is only present in SYN/SYN-ACK packet.
1718 1718                   * Compare with ~TH_FIN to mask out T/TCP setups.
1719 1719                   */
1720 1720                  flags = tcp->th_flags & ~(TH_FIN|TH_ECNALL);
1721 1721                  if (flags == (TH_SYN|TH_ACK)) {
  
    | 
      ↓ open down ↓ | 
    1721 lines elided | 
    
      ↑ open up ↑ | 
  
1722 1722                          is->is_s0[source] = ntohl(tcp->th_ack);
1723 1723                          is->is_s0[!source] = ntohl(tcp->th_seq) + 1;
1724 1724                          if (TCP_OFF(tcp) > (sizeof (tcphdr_t) >> 2)) {
1725 1725                                  (void) fr_tcpoptions(fin, tcp, fdata);
1726 1726                          }
1727 1727                          if ((fin->fin_out != 0) && (is->is_pass & FR_NEWISN))
1728 1728                                  fr_checknewisn(fin, is);
1729 1729                  } else if (flags == TH_SYN) {
1730 1730                          is->is_s0[source] = ntohl(tcp->th_seq) + 1;
1731 1731                          if ((TCP_OFF(tcp) > (sizeof(tcphdr_t) >> 2)))
1732      -                                (void) fr_tcpoptions(fin, tcp, tdata);
     1732 +                                (void) fr_tcpoptions(fin, tcp, fdata);
1733 1733  
1734 1734                          if ((fin->fin_out != 0) && (is->is_pass & FR_NEWISN))
1735 1735                                  fr_checknewisn(fin, is);
1736 1736  
1737 1737                  }
1738 1738                  ret = 1;
1739 1739          } else
1740 1740                  fin->fin_flx |= FI_OOW;
1741 1741          MUTEX_EXIT(&is->is_lock);
1742 1742          return ret;
1743 1743  }
1744 1744  
1745 1745  
1746 1746  /* ------------------------------------------------------------------------ */
1747 1747  /* Function:    fr_checknewisn                                              */
1748 1748  /* Returns:     Nil                                                         */
1749 1749  /* Parameters:  fin(I)   - pointer to packet information                    */
1750 1750  /*              is(I)  - pointer to master state structure                  */
1751 1751  /*                                                                          */
1752 1752  /* Check to see if this TCP connection is expecting and needs a new         */
1753 1753  /* sequence number for a particular direction of the connection.            */
1754 1754  /*                                                                          */
1755 1755  /* NOTE: This does not actually change the sequence numbers, only gets new  */
1756 1756  /* one ready.                                                               */
1757 1757  /* ------------------------------------------------------------------------ */
1758 1758  static void fr_checknewisn(fin, is)
1759 1759  fr_info_t *fin;
1760 1760  ipstate_t *is;
1761 1761  {
1762 1762          u_32_t sumd, old, new;
1763 1763          tcphdr_t *tcp;
1764 1764          int i;
1765 1765  
1766 1766          i = fin->fin_rev;
1767 1767          tcp = fin->fin_dp;
1768 1768  
1769 1769          if (((i == 0) && !(is->is_flags & IS_ISNSYN)) ||
1770 1770              ((i == 1) && !(is->is_flags & IS_ISNACK))) {
1771 1771                  old = ntohl(tcp->th_seq);
1772 1772                  new = fr_newisn(fin);
1773 1773                  is->is_isninc[i] = new - old;
1774 1774                  CALC_SUMD(old, new, sumd);
1775 1775                  is->is_sumd[i] = (sumd & 0xffff) + (sumd >> 16);
1776 1776  
1777 1777                  is->is_flags |= ((i == 0) ? IS_ISNSYN : IS_ISNACK);
1778 1778          }
1779 1779  }
1780 1780  
1781 1781  
1782 1782  /* ------------------------------------------------------------------------ */
1783 1783  /* Function:    fr_tcpinwindow                                              */
1784 1784  /* Returns:     int - 1 == packet inside TCP "window", 0 == not inside.     */
1785 1785  /* Parameters:  fin(I)   - pointer to packet information                    */
1786 1786  /*              fdata(I) - pointer to tcp state information (forward)       */
1787 1787  /*              tdata(I) - pointer to tcp state information (reverse)       */
1788 1788  /*              tcp(I)   - pointer to TCP packet header                     */
1789 1789  /*                                                                          */
1790 1790  /* Given a packet has matched addresses and ports, check to see if it is    */
1791 1791  /* within the TCP data window.  In a show of generosity, allow packets that */
1792 1792  /* are within the window space behind the current sequence # as well.       */
1793 1793  /* ------------------------------------------------------------------------ */
1794 1794  int fr_tcpinwindow(fin, fdata, tdata, tcp, flags)
1795 1795  fr_info_t *fin;
1796 1796  tcpdata_t  *fdata, *tdata;
1797 1797  tcphdr_t *tcp;
1798 1798  int flags;
1799 1799  {
1800 1800          tcp_seq seq, ack, end;
1801 1801          int ackskew, tcpflags;
1802 1802          u_32_t win, maxwin;
1803 1803          int dsize, inseq;
1804 1804  
1805 1805          /*
1806 1806           * Find difference between last checked packet and this packet.
1807 1807           */
1808 1808          tcpflags = tcp->th_flags;
1809 1809          seq = ntohl(tcp->th_seq);
1810 1810          ack = ntohl(tcp->th_ack);
1811 1811  
1812 1812          if (tcpflags & TH_SYN)
1813 1813                  win = ntohs(tcp->th_win);
1814 1814          else
1815 1815                  win = ntohs(tcp->th_win) << fdata->td_winscale;
1816 1816  
1817 1817          /*
1818 1818           * win 0 means the receiving endpoint has closed the window, because it
1819 1819           * has not enough memory to receive data from sender. In such case we
1820 1820           * are pretending window size to be 1 to let TCP probe data through.
1821 1821           * TCP probe data can be either 0 or 1 octet of data, the RFC does not
1822 1822           * state this accurately, so we have to allow 1 octet (win = 1) even if
1823 1823           * the window is closed (win == 0).
1824 1824           */
1825 1825          if (win == 0)
1826 1826                  win = 1;
1827 1827  
1828 1828          dsize = fin->fin_dlen - (TCP_OFF(tcp) << 2) +
1829 1829                  ((tcpflags & TH_SYN) ? 1 : 0) + ((tcpflags & TH_FIN) ? 1 : 0);
1830 1830  
1831 1831          /*
1832 1832           * if window scaling is present, the scaling is only allowed
  
    | 
      ↓ open down ↓ | 
    90 lines elided | 
    
      ↑ open up ↑ | 
  
1833 1833           * for windows not in the first SYN packet. In that packet the
1834 1834           * window is 65535 to specify the largest window possible
1835 1835           * for receivers not implementing the window scale option.
1836 1836           * Currently, we do not assume TTCP here. That means that
1837 1837           * if we see a second packet from a host (after the initial
1838 1838           * SYN), we can assume that the receiver of the SYN did
1839 1839           * already send back the SYN/ACK (and thus that we know if
1840 1840           * the receiver also does window scaling)
1841 1841           */
1842 1842          if (!(tcpflags & TH_SYN) && (fdata->td_winflags & TCP_WSCALE_FIRST)) {
     1843 +                fdata->td_winflags &= ~TCP_WSCALE_FIRST;
1843 1844                  fdata->td_maxwin = win;
1844 1845          }
1845 1846  
1846 1847          end = seq + dsize;
1847 1848  
1848 1849          if ((fdata->td_end == 0) &&
1849 1850              (!(flags & IS_TCPFSM) ||
1850 1851               ((tcpflags & TH_OPENING) == TH_OPENING))) {
1851 1852                  /*
1852 1853                   * Must be a (outgoing) SYN-ACK in reply to a SYN.
1853 1854                   */
1854 1855                  fdata->td_end = end - 1;
1855 1856                  fdata->td_maxwin = 1;
1856 1857                  fdata->td_maxend = end + win;
1857 1858          }
1858 1859  
1859 1860          if (!(tcpflags & TH_ACK)) {  /* Pretend an ack was sent */
1860 1861                  ack = tdata->td_end;
1861 1862          } else if (((tcpflags & (TH_ACK|TH_RST)) == (TH_ACK|TH_RST)) &&
1862 1863                     (ack == 0)) {
1863 1864                  /* gross hack to get around certain broken tcp stacks */
1864 1865                  ack = tdata->td_end;
1865 1866          }
1866 1867  
1867 1868          maxwin = tdata->td_maxwin;
1868 1869          ackskew = tdata->td_end - ack;
1869 1870  
1870 1871          /*
1871 1872           * Strict sequencing only allows in-order delivery.
1872 1873           */
1873 1874          if ((flags & IS_STRICT) != 0) {
1874 1875                  if (seq != fdata->td_end) {
1875 1876                          DTRACE_PROBE(strict_check);
1876 1877                          return 0;
1877 1878                  }
1878 1879          }
1879 1880  
1880 1881  #define SEQ_GE(a,b)     ((int)((a) - (b)) >= 0)
1881 1882  #define SEQ_GT(a,b)     ((int)((a) - (b)) > 0)
1882 1883          inseq = 0;
1883 1884          DTRACE_PROBE4(
1884 1885                  dyn_params,
1885 1886                  int, dsize,
1886 1887                  int, ackskew,
1887 1888                  int, maxwin,
1888 1889                  int, win
1889 1890          );
1890 1891          if (
1891 1892  #if defined(_KERNEL)
1892 1893                  /* 
1893 1894                   * end <-> s + n
1894 1895                   * maxend <-> ack + win
  
    | 
      ↓ open down ↓ | 
    42 lines elided | 
    
      ↑ open up ↑ | 
  
1895 1896                   * this is upperbound check
1896 1897                   */
1897 1898              (SEQ_GE(fdata->td_maxend, end)) &&
1898 1899                  /*
1899 1900                   * this is lowerbound check
1900 1901                   */
1901 1902              (SEQ_GE(seq, fdata->td_end - maxwin)) &&
1902 1903  #endif
1903 1904  /* XXX what about big packets */
1904 1905  #define MAXACKWINDOW 66000
1905      -            (-ackskew <= (MAXACKWINDOW << fdata->td_winscale)) &&
     1906 +            (-ackskew <= (MAXACKWINDOW)) &&
1906 1907              ( ackskew <= (MAXACKWINDOW << fdata->td_winscale))) {
1907 1908                  inseq = 1;
1908 1909          /*
1909 1910           * Microsoft Windows will send the next packet to the right of the
1910 1911           * window if SACK is in use.
1911 1912           */
1912 1913          } else if ((seq == fdata->td_maxend) && (ackskew == 0) &&
1913 1914              (fdata->td_winflags & TCP_SACK_PERMIT) &&
1914 1915              (tdata->td_winflags & TCP_SACK_PERMIT)) {
1915 1916                  inseq = 1;
1916 1917          /*
1917 1918           * RST ACK with SEQ equal to 0 is sent by some OSes (i.e. Solaris) as a
1918 1919           * response to initial SYN packet, when  there is no application
1919 1920           * listeing to on a port, where the SYN packet has came to.
1920 1921           */
1921 1922          } else if ((seq == 0) && (tcpflags == (TH_RST|TH_ACK)) &&
1922 1923                          (ackskew >= -1) && (ackskew <= 1)) {
1923 1924                  inseq = 1;
1924 1925          } else if (!(flags & IS_TCPFSM)) {
1925 1926  
1926 1927                  if (!(fdata->td_winflags &
1927 1928                              (TCP_WSCALE_SEEN|TCP_WSCALE_FIRST))) {
1928 1929                          /*
1929 1930                           * No TCPFSM and no window scaling, so make some
1930 1931                           * extra guesses.
1931 1932                           */
1932 1933                          if ((seq == fdata->td_maxend) && (ackskew == 0))
1933 1934                                  inseq = 1;
1934 1935                          else if (SEQ_GE(seq + maxwin, fdata->td_end - maxwin))
1935 1936                                  inseq = 1;
1936 1937                  }
1937 1938          }
1938 1939  
1939 1940          if (inseq) {
1940 1941                  /* if ackskew < 0 then this should be due to fragmented
1941 1942                   * packets. There is no way to know the length of the
1942 1943                   * total packet in advance.
1943 1944                   * We do know the total length from the fragment cache though.
1944 1945                   * Note however that there might be more sessions with
1945 1946                   * exactly the same source and destination parameters in the
1946 1947                   * state cache (and source and destination is the only stuff
1947 1948                   * that is saved in the fragment cache). Note further that
1948 1949                   * some TCP connections in the state cache are hashed with
1949 1950                   * sport and dport as well which makes it not worthwhile to
1950 1951                   * look for them.
1951 1952                   * Thus, when ackskew is negative but still seems to belong
1952 1953                   * to this session, we bump up the destinations end value.
1953 1954                   */
1954 1955                  if (ackskew < 0) {
1955 1956                          DTRACE_PROBE2(end_update_td,
1956 1957                                  int, tdata->td_end,
1957 1958                                  int, ack
1958 1959                          );
1959 1960                          tdata->td_end = ack;
1960 1961                  }
1961 1962  
1962 1963                  /* update max window seen */
1963 1964                  if (fdata->td_maxwin < win) {
1964 1965                          DTRACE_PROBE2(win_update_fd,
1965 1966                                  int, fdata->td_maxwin,
1966 1967                                  int, win
1967 1968                          );
1968 1969                          fdata->td_maxwin = win;
1969 1970                  }
1970 1971  
1971 1972                  if (SEQ_GT(end, fdata->td_end)) {
1972 1973                          DTRACE_PROBE2(end_update_fd,
1973 1974                                  int, fdata->td_end,
1974 1975                                  int, end
1975 1976                          );
1976 1977                          fdata->td_end = end;
1977 1978                  }
1978 1979  
1979 1980                  if (SEQ_GE(ack + win, tdata->td_maxend)) {
1980 1981                          DTRACE_PROBE2(max_end_update_td,
1981 1982                                  int, tdata->td_maxend,
1982 1983                                  int, ack + win
1983 1984                          );
1984 1985                          tdata->td_maxend = ack + win;
1985 1986                  }
1986 1987  
1987 1988                  return 1;
1988 1989          }
1989 1990          fin->fin_flx |= FI_OOW;
1990 1991  
1991 1992  #if defined(_KERNEL)
1992 1993          if (!(SEQ_GE(seq, fdata->td_end - maxwin)))
1993 1994                  fin->fin_flx |= FI_NEG_OOW;
1994 1995  #endif
1995 1996  
1996 1997          return 0;
1997 1998  }
1998 1999  
1999 2000  
2000 2001  /* ------------------------------------------------------------------------ */
2001 2002  /* Function:    fr_stclone                                                  */
2002 2003  /* Returns:     ipstate_t* - NULL == cloning failed,                        */
2003 2004  /*                           else pointer to new state structure            */
2004 2005  /* Parameters:  fin(I) - pointer to packet information                      */
2005 2006  /*              tcp(I) - pointer to TCP/UDP header                          */
2006 2007  /*              is(I)  - pointer to master state structure                  */
2007 2008  /*                                                                          */
2008 2009  /* Create a "duplicate" state table entry from the master.  The clone is a  */
           /* byte copy of the master with is_send/is_dend/is_max* fields re-seeded    */
           /* from this packet's TCP header; it is marked SI_CLONED (SI_CLONE cleared) */
           /* and passed to fr_stinsert().  NULL is returned if the state table is     */
           /* already at ifs_fr_statemax entries or the allocation fails.              */
2009 2010  /* ------------------------------------------------------------------------ */
2010 2011  static ipstate_t *fr_stclone(fin, tcp, is)
2011 2012  fr_info_t *fin;
2012 2013  tcphdr_t *tcp;
2013 2014  ipstate_t *is;
2014 2015  {
2015 2016          ipstate_t *clone;
2016 2017          u_32_t send;
2017 2018          ipf_stack_t *ifs = fin->fin_ifs;
2018 2019  
2019 2020          /*
2020 2021           * Trigger automatic call to fr_state_flush() if the
2021 2022           * table has reached capacity specified by hi watermark.
2022 2023           */
2023 2024          if (ST_TAB_WATER_LEVEL(ifs) > ifs->ifs_state_flush_level_hi)
2024 2025                  ifs->ifs_fr_state_doflush = 1;
2025 2026  
2026 2027          /*
2027 2028           * If automatic flushing did not do its job, and the table
2028 2029           * has filled up, don't try to create a new entry.  A NULL
2029 2030           * return will indicate that the cloning has failed.
2030 2031           */
2031 2032          if (ifs->ifs_ips_num >= ifs->ifs_fr_statemax) {
2032 2033                  ATOMIC_INCL(ifs->ifs_ips_stats.iss_max);
2033 2034                  return NULL;
2034 2035          }
2035 2036  
2036 2037          KMALLOC(clone, ipstate_t *);
2037 2038          if (clone == NULL)
2038 2039                  return NULL;
                   /* Start from a byte-for-byte copy of the master entry. */
2039 2040          bcopy((char *)is, (char *)clone, sizeof(*clone));
2040 2041  
                   /*
                    * NOTE(review): the bcopy above also copied the master's is_lock;
                    * presumably MUTEX_NUKE resets it so the clone does not share lock
                    * state with the master - confirm against the macro definition.
                    */
2041 2042          MUTEX_NUKE(&clone->is_lock);
2042 2043  
                   /*
                    * Expiry is set to ONE_DAY past the current ifs_fr_ticks value and
                    * both is_state[] slots are reset to 0.
                    */
2043 2044          clone->is_die = ONE_DAY + ifs->ifs_fr_ticks;
2044 2045          clone->is_state[0] = 0;
2045 2046          clone->is_state[1] = 0;
                   /*
                    * send = th_seq + (fin_dlen - TCP header length), plus one each
                    * for SYN and FIN if those flags are set in this segment.
                    */
2046 2047          send = ntohl(tcp->th_seq) + fin->fin_dlen - (TCP_OFF(tcp) << 2) +
2047 2048                  ((tcp->th_flags & TH_SYN) ? 1 : 0) +
2048 2049                  ((tcp->th_flags & TH_FIN) ? 1 : 0);
2049 2050  
                   /*
                    * Seed the side this packet belongs to (the 'd' fields when
                    * fin_rev is 1, otherwise the 's' fields) with send and the
                    * advertised window from th_win, treating a zero window as 1.
                    * The opposite side starts with an end of 0 and a max window of 1.
                    */
2050 2051          if (fin->fin_rev == 1) {
2051 2052                  clone->is_dend = send;
2052 2053                  clone->is_maxdend = send;
2053 2054                  clone->is_send = 0;
2054 2055                  clone->is_maxswin = 1;
2055 2056                  clone->is_maxdwin = ntohs(tcp->th_win);
2056 2057                  if (clone->is_maxdwin == 0)
2057 2058                          clone->is_maxdwin = 1;
2058 2059          } else {
2059 2060                  clone->is_send = send;
2060 2061                  clone->is_maxsend = send;
2061 2062                  clone->is_dend = 0;
2062 2063                  clone->is_maxdwin = 1;
2063 2064                  clone->is_maxswin = ntohs(tcp->th_win);
2064 2065                  if (clone->is_maxswin == 0)
2065 2066                          clone->is_maxswin = 1;
2066 2067          }
2067 2068  
                   /* The copy is a clone, not a template for further cloning. */
2068 2069          clone->is_flags &= ~SI_CLONE;
2069 2070          clone->is_flags |= SI_CLONED;
2070 2071          fr_stinsert(clone, fin->fin_rev, ifs);
2071 2072          clone->is_ref = 1;
2072 2073          if (clone->is_p == IPPROTO_TCP) {
2073 2074                  (void) fr_tcp_age(&clone->is_sti, fin, ifs->ifs_ips_tqtqb,
2074 2075                                    clone->is_flags);
2075 2076          }
                   /*
                    * NOTE(review): no MUTEX_ENTER on is_lock is visible in this
                    * function; presumably fr_stinsert() returns with the lock held -
                    * confirm.
                    */
2076 2077          MUTEX_EXIT(&clone->is_lock);
2077 2078  #ifdef  IPFILTER_SCAN
                   /*
                    * NOTE(review): this is called on the master (is), not on the
                    * clone - confirm that is intended.
                    */
2078 2079          (void) ipsc_attachis(is);
2079 2080  #endif
2080 2081  #ifdef  IPFILTER_SYNC
2081 2082          if (is->is_flags & IS_STATESYNC)
2082 2083                  clone->is_sync = ipfsync_new(SMC_STATE, fin, clone);
2083 2084  #endif
2084 2085          return clone;
2085 2086  }
2086 2087  
2087 2088  
2088 2089  /* ------------------------------------------------------------------------ */
2089 2090  /* Function:    fr_matchsrcdst                                              */
2090 2091  /* Returns:     ipstate_t* - NULL == no match (or cloning failed),          */
           /*                           else pointer to the matched state entry (the   */
           /*                           new clone when the master has SI_CLONE set)    */
2091 2092  /* Parameters:  fin(I) - pointer to packet information                      */
2092 2093  /*              is(I)  - pointer to state structure                         */
2093 2094  /*              src(I) - pointer to source address                          */
2094 2095  /*              dst(I) - pointer to destination address                     */
2095 2096  /*              tcp(I) - pointer to TCP/UDP header (may be NULL)            */
           /*              cmask(I) - mask applied to fin_flx and the saved flags      */
           /*                         before they are compared                         */
2096 2097  /*                                                                          */
2097 2098  /* Match a state table entry against an IP packet.  The logic below is that */
2098 2099  /* ret gets set to one if the match succeeds, else remains 0.  If it is     */
2099 2100  /* still 0 after the test, there is no match.  On success fin_rev is set to */
           /* the direction (rev) computed for the packet.                             */
2100 2101  /* ------------------------------------------------------------------------ */
2101 2102  static ipstate_t *fr_matchsrcdst(fin, is, src, dst, tcp, cmask)
2102 2103  fr_info_t *fin;
2103 2104  ipstate_t *is;
2104 2105  i6addr_t *src, *dst;
2105 2106  tcphdr_t *tcp;
2106 2107  u_32_t cmask;
2107 2108  {
2108 2109          int ret = 0, rev, out, flags, flx = 0, idx;
2109 2110          u_short sp, dp;
2110 2111          u_32_t cflx;
2111 2112          void *ifp;
2112 2113          ipf_stack_t *ifs = fin->fin_ifs;
2113 2114  
                   /*
                    * First guess at direction: the packet is treated as reversed when
                    * its destination differs from the destination stored in the state.
                    */
2114 2115          rev = IP6_NEQ(&is->is_dst, dst);
2115 2116          ifp = fin->fin_ifp;
2116 2117          out = fin->fin_out;
2117 2118          flags = is->is_flags;
2118 2119          sp = 0;
2119 2120          dp = 0;
2120 2121  
                   /*
                    * NOTE(review): sp is converted with htons() and dp with ntohs();
                    * presumably both perform the same 16-bit conversion here - confirm
                    * and consider making them consistent.
                    */
2121 2122          if (tcp != NULL) {
2122 2123                  sp = htons(fin->fin_sport);
2123 2124                  dp = ntohs(fin->fin_dport);
2124 2125          }
                   /*
                    * The addresses look forward, but a mismatch on a non-wildcard port
                    * also marks the packet as belonging to the reverse direction.
                    */
2125 2126          if (!rev) {
2126 2127                  if (tcp != NULL) {
2127 2128                          if (!(flags & SI_W_SPORT) && (sp != is->is_sport))
2128 2129                                  rev = 1;
2129 2130                          else if (!(flags & SI_W_DPORT) && (dp != is->is_dport))
2130 2131                                  rev = 1;
2131 2132                  }
2132 2133          }
2133 2134  
                   /* Slot in is_ifp[]/is_ifname[]: bit 1 is "out", bit 0 is "rev". */
2134 2135          idx = (out << 1) + rev;
2135 2136  
2136 2137          /*
2137 2138           * If the interface for this 'direction' is set, make sure it matches.
2138 2139           * An interface name that is not set matches any, as does a name of *.
2139 2140           */
2140 2141          if ((is->is_ifp[idx] == NULL &&
2141 2142              (*is->is_ifname[idx] == '\0' || *is->is_ifname[idx] == '*')) ||
2142 2143              is->is_ifp[idx] == ifp)
2143 2144                  ret = 1;
2144 2145  
2145 2146          if (ret == 0) {
2146 2147                  DTRACE_PROBE(no_match_on_iface);
2147 2148                  return NULL;
2148 2149          }
2149 2150          ret = 0;
2150 2151  
2151 2152          /*
2152 2153           * Match addresses and ports.
2153 2154           */
2154 2155          if (rev == 0) {
2155 2156                  if ((IP6_EQ(&is->is_dst, dst) || (flags & SI_W_DADDR)) &&
2156 2157                      (IP6_EQ(&is->is_src, src) || (flags & SI_W_SADDR))) {
2157 2158                          if (tcp) {
2158 2159                                  if ((sp == is->is_sport || flags & SI_W_SPORT)&&
2159 2160                                      (dp == is->is_dport || flags & SI_W_DPORT))
2160 2161                                          ret = 1;
2161 2162                          } else {
2162 2163                                  ret = 1;
2163 2164                          }
2164 2165                  }
2165 2166          } else {
                           /* Reverse direction: compare with src/dst and sp/dp swapped. */
2166 2167                  if ((IP6_EQ(&is->is_dst, src) || (flags & SI_W_DADDR)) &&
2167 2168                      (IP6_EQ(&is->is_src, dst) || (flags & SI_W_SADDR))) {
2168 2169                          if (tcp) {
2169 2170                                  if ((dp == is->is_sport || flags & SI_W_SPORT)&&
2170 2171                                      (sp == is->is_dport || flags & SI_W_DPORT))
2171 2172                                          ret = 1;
2172 2173                          } else {
2173 2174                                  ret = 1;
2174 2175                          }
2175 2176                  }
2176 2177          }
2177 2178  
2178 2179          if (ret == 0) {
2179 2180                  DTRACE_PROBE(no_match_on_addrs);
2180 2181                  return NULL;
2181 2182          }
2182 2183          /*
2183 2184           * Whether or not this should be here, is questionable, but the aim
2184 2185           * is to get this out of the main line.
                    *
                    * Without a TCP/UDP header the local copy of the flags has the
                    * port-wildcard and clone bits removed (presumably SI_WILDP covers
                    * SI_W_SPORT|SI_W_DPORT - confirm), so the port fill-in and cloning
                    * code further down is skipped.
2185 2186           */
2186 2187          if (tcp == NULL)
2187 2188                  flags = is->is_flags & ~(SI_WILDP|SI_NEWFR|SI_CLONE|SI_CLONED);
2188 2189  
2189 2190          /*
2190 2191           * Only one of the source or destination address can be flagged as a
2191 2192           * wildcard.  Fill in the missing address, if set.
2192 2193           * For IPv6, if the address being copied in is multicast, then
2193 2194           * don't reset the wild flag - multicast causes it to be set in the
2194 2195           * first place!
2195 2196           */
2196 2197          if ((flags & (SI_W_SADDR|SI_W_DADDR))) {
2197 2198                  fr_ip_t *fi = &fin->fin_fi;
2198 2199  
2199 2200                  if ((flags & SI_W_SADDR) != 0) {
2200 2201                          if (rev == 0) {
2201 2202  #ifdef USE_INET6
2202 2203                                  if (is->is_v == 6 &&
2203 2204                                      IN6_IS_ADDR_MULTICAST(&fi->fi_src.in6))
2204 2205                                          /*EMPTY*/;
2205 2206                                  else
2206 2207  #endif
2207 2208                                  {
2208 2209                                          is->is_src = fi->fi_src;
2209 2210                                          is->is_flags &= ~SI_W_SADDR;
2210 2211                                  }
2211 2212                          } else {
2212 2213  #ifdef USE_INET6
2213 2214                                  if (is->is_v == 6 &&
2214 2215                                      IN6_IS_ADDR_MULTICAST(&fi->fi_dst.in6))
2215 2216                                          /*EMPTY*/;
2216 2217                                  else
2217 2218  #endif
2218 2219                                  {
2219 2220                                          is->is_src = fi->fi_dst;
2220 2221                                          is->is_flags &= ~SI_W_SADDR;
2221 2222                                  }
2222 2223                          }
2223 2224                  } else if ((flags & SI_W_DADDR) != 0) {
2224 2225                          if (rev == 0) {
2225 2226  #ifdef USE_INET6
2226 2227                                  if (is->is_v == 6 &&
2227 2228                                      IN6_IS_ADDR_MULTICAST(&fi->fi_dst.in6))
2228 2229                                          /*EMPTY*/;
2229 2230                                  else
2230 2231  #endif
2231 2232                                  {
2232 2233                                          is->is_dst = fi->fi_dst;
2233 2234                                          is->is_flags &= ~SI_W_DADDR;
2234 2235                                  }
2235 2236                          } else {
2236 2237  #ifdef USE_INET6
2237 2238                                  if (is->is_v == 6 &&
2238 2239                                      IN6_IS_ADDR_MULTICAST(&fi->fi_src.in6))
2239 2240                                          /*EMPTY*/;
2240 2241                                  else
2241 2242  #endif
2242 2243                                  {
2243 2244                                          is->is_dst = fi->fi_src;
2244 2245                                          is->is_flags &= ~SI_W_DADDR;
2245 2246                                  }
2246 2247                          }
2247 2248                  }
                           /* No wildcard bits remain on the entry: one fewer wild state. */
2248 2249                  if ((is->is_flags & (SI_WILDA|SI_WILDP)) == 0) {
2249 2250                          ATOMIC_DECL(ifs->ifs_ips_stats.iss_wild);
2250 2251                  }
2251 2252          }
2252 2253  
                   /*
                    * flx is this packet's flags restricted to cmask; cflx is what was
                    * recorded earlier for this out/rev combination (0 if nothing yet).
                    */
2253 2254          flx = fin->fin_flx & cmask;
2254 2255          cflx = is->is_flx[out][rev];
2255 2256  
2256 2257          /*
2257 2258           * Match up any flags set from IP options.
2258 2259           */
2259 2260          if ((cflx && (flx != (cflx & cmask))) ||
2260 2261              ((fin->fin_optmsk & is->is_optmsk[rev]) != is->is_opt[rev]) ||
2261 2262              ((fin->fin_secmsk & is->is_secmsk) != is->is_sec) ||
2262 2263              ((fin->fin_auth & is->is_authmsk) != is->is_auth)) {
2263 2264                  DTRACE_PROBE4(no_match_on_flags,
2264 2265                      int, (cflx && (flx != (cflx & cmask))),
2265 2266                      int,
2266 2267                      ((fin->fin_optmsk & is->is_optmsk[rev]) != is->is_opt[rev]),
2267 2268                      int, ((fin->fin_secmsk & is->is_secmsk) != is->is_sec),
2268 2269                      int, ((fin->fin_auth & is->is_authmsk) != is->is_auth)
2269 2270                  );
2270 2271                  return NULL;
2271 2272          }
2272 2273          /*
2273 2274           * Only one of the source or destination port can be flagged as a
2274 2275           * wildcard.  When filling it in, fill in a copy of the matched entry
2275 2276           * if it has the cloning flag set.
                    *
                    * Packets flagged FI_IGNORE are reported as a match right away,
                    * without the port fill-in, cloning or interface updates below.
2276 2277           */
2277 2278          if ((fin->fin_flx & FI_IGNORE) != 0) {
2278 2279                  fin->fin_rev = rev;
2279 2280                  return is;
2280 2281          }
2281 2282  
2282 2283          if ((flags & (SI_W_SPORT|SI_W_DPORT))) {
2283 2284                  if ((flags & SI_CLONE) != 0) {
2284 2285                          ipstate_t *clone;
2285 2286  
2286 2287                          clone = fr_stclone(fin, tcp, is);
2287 2288                          if (clone == NULL)
2288 2289                                  return NULL;
                                   /* From here on the clone, not the master, is updated. */
2289 2290                          is = clone;
2290 2291                  } else {
2291 2292                          ATOMIC_DECL(ifs->ifs_ips_stats.iss_wild);
2292 2293                  }
2293 2294  
                           /*
                            * Fill in the wildcarded port and seed the matching sequence
                            * tracking field from th_seq or th_ack, depending on the
                            * direction of this packet.
                            */
2294 2295                  if ((flags & SI_W_SPORT) != 0) {
2295 2296                          if (rev == 0) {
2296 2297                                  is->is_sport = sp;
2297 2298                                  is->is_send = ntohl(tcp->th_seq);
2298 2299                          } else {
2299 2300                                  is->is_sport = dp;
2300 2301                                  is->is_send = ntohl(tcp->th_ack);
2301 2302                          }
2302 2303                          is->is_maxsend = is->is_send + 1;
2303 2304                  } else if ((flags & SI_W_DPORT) != 0) {
2304 2305                          if (rev == 0) {
2305 2306                                  is->is_dport = dp;
2306 2307                                  is->is_dend = ntohl(tcp->th_ack);
2307 2308                          } else {
2308 2309                                  is->is_dport = sp;
2309 2310                                  is->is_dend = ntohl(tcp->th_seq);
2310 2311                          }
2311 2312                          is->is_maxdend = is->is_dend + 1;
2312 2313                  }
2313 2314                  is->is_flags &= ~(SI_W_SPORT|SI_W_DPORT);
                           /*
                            * NOTE(review): flags is the copy taken before any cloning, so
                            * SI_CLONED here reflects the entry passed in, not a clone
                            * created just above - confirm that is intended.
                            */
2314 2315                  if ((flags & SI_CLONED) && ifs->ifs_ipstate_logging)
2315 2316                          ipstate_log(is, ISL_CLONE, ifs);
2316 2317          }
2317 2318  
                   /* NOTE(review): ret is not read again after this assignment. */
2318 2319          ret = -1;
2319 2320  
                   /*
                    * Nothing recorded yet for this out/rev combination: remember the
                    * (masked) flags of this packet for later comparisons.
                    */
2320 2321          if (is->is_flx[out][rev] == 0) {
2321 2322                  is->is_flx[out][rev] = flx;
2322 2323                  /*
2323 2324                   * If we are dealing with the first packet coming in reverse
2324 2325                   * direction (sent by peer), then we have to set options into
2325 2326                   * state.
2326 2327                   */
2327 2328                  if (rev == 1 && is->is_optmsk[1] == 0x0) {
2328 2329                          is->is_optmsk[1] = 0xffffffff;
2329 2330                          is->is_opt[1] = fin->fin_optmsk;
2330 2331                          DTRACE_PROBE(set_rev_opts);
2331 2332                  }
                           /* For IPv6 the 0x8 option bit is excluded from the comparison. */
2332 2333                  if (is->is_v == 6) {
2333 2334                          is->is_opt[rev] &= ~0x8;
2334 2335                          is->is_optmsk[rev] &= ~0x8;
2335 2336                  }
2336 2337          }
2337 2338  
2338 2339          /*
2339 2340           * Check if the interface name for this "direction" is set and if not,
2340 2341           * fill it in.
2341 2342           */
2342 2343          if (is->is_ifp[idx] == NULL &&
2343 2344              (*is->is_ifname[idx] == '\0' || *is->is_ifname[idx] == '*')) {
2344 2345                  is->is_ifp[idx] = ifp;
2345 2346                  COPYIFNAME(ifp, is->is_ifname[idx], fin->fin_v);
2346 2347          }
2347 2348          fin->fin_rev = rev;
2348 2349          return is;
2349 2350  }
2350 2351  
2351 2352  
2352 2353  /* ------------------------------------------------------------------------ */
2353 2354  /* Function:    fr_checkicmpmatchingstate                                   */
2354 2355  /* Returns:     ipstate_t* - NULL if no match, else the matching state      */
2355 2356  /* Parameters:  fin(I) - pointer to packet information                      */
2356 2357  /*                                                                          */
2357 2358  /* If we've got an ICMP error message, using the information stored in the  */
2358 2359  /* ICMP packet, look for a matching state table entry.                      */
2359 2360  /*                                                                          */
2360 2361  /* If we return NULL then no lock on ipf_state is held.                     */
2361 2362  /* If we return non-null then a read-lock on ipf_state is held.             */
2362 2363  /* ------------------------------------------------------------------------ */
2363 2364  static ipstate_t *fr_checkicmpmatchingstate(fin)
2364 2365  fr_info_t *fin;
2365 2366  {
2366 2367          ipstate_t *is, **isp;
2367 2368          u_short sport, dport;
2368 2369          u_char  pr;
2369 2370          int backward, i, oi;
2370 2371          i6addr_t dst, src;
2371 2372          struct icmp *ic;
2372 2373          u_short savelen;
2373 2374          icmphdr_t *icmp;
2374 2375          fr_info_t ofin;
2375 2376          tcphdr_t *tcp;
2376 2377          int len;
2377 2378          ip_t *oip;
2378 2379          u_int hv;
2379 2380          ipf_stack_t *ifs = fin->fin_ifs;
2380 2381  
2381 2382          /*
2382 2383           * Does it at least have the return (basic) IP header ?
2383 2384           * Is it an actual recognised ICMP error type?
2384 2385           * Only a basic IP header (no options) should be with
2385 2386           * an ICMP error header.
2386 2387           */
2387 2388          if ((fin->fin_v != 4) || (fin->fin_hlen != sizeof(ip_t)) ||
2388 2389              (fin->fin_plen < ICMPERR_MINPKTLEN) ||
2389 2390              !(fin->fin_flx & FI_ICMPERR))
2390 2391                  return NULL;
2391 2392          ic = fin->fin_dp;
2392 2393  
2393 2394          oip = (ip_t *)((char *)ic + ICMPERR_ICMPHLEN);
2394 2395          /*
2395 2396           * Check that at least the old IP header (with options) and
2396 2397           * 8 bytes of payload are present.
2397 2398           */
2398 2399          if (fin->fin_plen < ICMPERR_MAXPKTLEN + ((IP_HL(oip) - 5) << 2))
2399 2400                  return NULL;
2400 2401  
2401 2402          /*
2402 2403           * Sanity Checks.
2403 2404           */
2404 2405          len = fin->fin_dlen - ICMPERR_ICMPHLEN;
2405 2406          if ((len <= 0) || ((IP_HL(oip) << 2) > len))
2406 2407                  return NULL;
2407 2408  
2408 2409          /*
2409 2410           * Is the buffer big enough for all of it ?  It's the size of the IP
2410 2411           * header claimed in the encapsulated part which is of concern.  It
2411 2412           * may be too big to be in this buffer but not so big that it's
2412 2413           * outside the ICMP packet, leading to TCP deref's causing problems.
2413 2414           * This is possible because we don't know how big oip_hl is when we
2414 2415           * do the pullup early in fr_check() and thus can't guarantee it is
2415 2416           * all here now.
2416 2417           */
2417 2418  #ifdef  _KERNEL
2418 2419          {
2419 2420          mb_t *m;
2420 2421  
2421 2422          m = fin->fin_m;
2422 2423  # if defined(MENTAT)
2423 2424          if ((char *)oip + len > (char *)m->b_wptr)
2424 2425                  return NULL;
2425 2426  # else
2426 2427          if ((char *)oip + len > (char *)fin->fin_ip + m->m_len)
2427 2428                  return NULL;
2428 2429  # endif
2429 2430          }
2430 2431  #endif
2431 2432          bcopy((char *)fin, (char *)&ofin, sizeof(*fin));
2432 2433  
2433 2434          /*
2434 2435           * in the IPv4 case we must zero the i6addr union otherwise
2435 2436           * the IP6_EQ and IP6_NEQ macros produce the wrong results because
2436 2437           * of the 'junk' in the unused part of the union
2437 2438           */
2438 2439          bzero((char *)&src, sizeof(src));
2439 2440          bzero((char *)&dst, sizeof(dst));
2440 2441  
2441 2442          /*
2442 2443           * we make a fin entry to be able to feed it to
2443 2444           * fr_matchsrcdst. Note that not all fields are necessary
2444 2445           * but this is the cleanest way. Note further we fill
2445 2446           * in fin_mp such that if someone uses it we'll get
2446 2447           * a kernel panic. fr_matchsrcdst does not use this.
2447 2448           *
2448 2449           * watch out here, as ip is in host order and oip in network
2449 2450           * order. Any change we make must be undone afterwards, like
2450 2451           * oip->ip_off - it is still in network byte order so fix it.
2451 2452           */
2452 2453          savelen = oip->ip_len;
2453 2454          oip->ip_len = len;
2454 2455          oip->ip_off = ntohs(oip->ip_off);
2455 2456  
2456 2457          ofin.fin_flx = FI_NOCKSUM;
2457 2458          ofin.fin_v = 4;
2458 2459          ofin.fin_ip = oip;
2459 2460          ofin.fin_m = NULL;      /* if dereferenced, panic XXX */
2460 2461          ofin.fin_mp = NULL;     /* if dereferenced, panic XXX */
2461 2462          ofin.fin_plen = fin->fin_dlen - ICMPERR_ICMPHLEN;
2462 2463          (void) fr_makefrip(IP_HL(oip) << 2, oip, &ofin);
2463 2464          ofin.fin_ifp = fin->fin_ifp;
2464 2465          ofin.fin_out = !fin->fin_out;
2465 2466          /*
2466 2467           * Reset the short and bad flag here because in fr_matchsrcdst()
2467 2468           * the flags for the current packet (fin_flx) are compared against
2468 2469           * those for the existing session.
2469 2470           */
2470 2471          ofin.fin_flx &= ~(FI_BAD|FI_SHORT);
2471 2472  
2472 2473          /*
2473 2474           * Put old values of ip_len and ip_off back as we don't know
2474 2475           * if we have to forward the packet (or process it again.
2475 2476           */
2476 2477          oip->ip_len = savelen;
2477 2478          oip->ip_off = htons(oip->ip_off);
2478 2479  
2479 2480          switch (oip->ip_p)
2480 2481          {
2481 2482          case IPPROTO_ICMP :
2482 2483                  /*
2483 2484                   * an ICMP error can only be generated as a result of an
2484 2485                   * ICMP query, not as the response on an ICMP error
2485 2486                   *
2486 2487                   * XXX theoretically ICMP_ECHOREP and the other reply's are
2487 2488                   * ICMP query's as well, but adding them here seems strange XXX
2488 2489                   */
2489 2490                  if ((ofin.fin_flx & FI_ICMPERR) != 0)
2490 2491                          return NULL;
2491 2492  
2492 2493                  /*
2493 2494                   * perform a lookup of the ICMP packet in the state table
2494 2495                   */
2495 2496                  icmp = (icmphdr_t *)((char *)oip + (IP_HL(oip) << 2));
2496 2497                  hv = (pr = oip->ip_p);
2497 2498                  src.in4 = oip->ip_src;
2498 2499                  hv += src.in4.s_addr;
2499 2500                  dst.in4 = oip->ip_dst;
2500 2501                  hv += dst.in4.s_addr;
2501 2502                  hv += icmp->icmp_id;
2502 2503                  hv = DOUBLE_HASH(hv, ifs);
2503 2504  
2504 2505                  READ_ENTER(&ifs->ifs_ipf_state);
2505 2506                  for (isp = &ifs->ifs_ips_table[hv]; ((is = *isp) != NULL); ) {
2506 2507                          isp = &is->is_hnext;
2507 2508                          if ((is->is_p != pr) || (is->is_v != 4))
2508 2509                                  continue;
2509 2510                          if (is->is_pass & FR_NOICMPERR)
2510 2511                                  continue;
2511 2512                          is = fr_matchsrcdst(&ofin, is, &src, &dst,
2512 2513                                              NULL, FI_ICMPCMP);
2513 2514                          if (is != NULL) {
2514 2515                                  if ((is->is_pass & FR_NOICMPERR) != 0) {
2515 2516                                          RWLOCK_EXIT(&ifs->ifs_ipf_state);
2516 2517                                          return NULL;
2517 2518                                  }
2518 2519                                  /*
2519 2520                                   * i  : the index of this packet (the icmp
2520 2521                                   *      unreachable)
2521 2522                                   * oi : the index of the original packet found
2522 2523                                   *      in the icmp header (i.e. the packet
2523 2524                                   *      causing this icmp)
2524 2525                                   * backward : original packet was backward
2525 2526                                   *      compared to the state
2526 2527                                   */
2527 2528                                  backward = IP6_NEQ(&is->is_src, &src);
2528 2529                                  fin->fin_rev = !backward;
2529 2530                                  i = (!backward << 1) + fin->fin_out;
2530 2531                                  oi = (backward << 1) + ofin.fin_out;
2531 2532                                  if (is->is_icmppkts[i] > is->is_pkts[oi])
2532 2533                                          continue;
2533 2534                                  ifs->ifs_ips_stats.iss_hits++;
2534 2535                                  is->is_icmppkts[i]++;
2535 2536                                  return is;
2536 2537                          }
2537 2538                  }
2538 2539                  RWLOCK_EXIT(&ifs->ifs_ipf_state);
2539 2540                  return NULL;
2540 2541          case IPPROTO_TCP :
2541 2542          case IPPROTO_UDP :
2542 2543                  break;
2543 2544          default :
2544 2545                  return NULL;
2545 2546          }
2546 2547  
2547 2548          tcp = (tcphdr_t *)((char *)oip + (IP_HL(oip) << 2));
2548 2549          dport = tcp->th_dport;
2549 2550          sport = tcp->th_sport;
2550 2551  
2551 2552          hv = (pr = oip->ip_p);
2552 2553          src.in4 = oip->ip_src;
2553 2554          hv += src.in4.s_addr;
2554 2555          dst.in4 = oip->ip_dst;
2555 2556          hv += dst.in4.s_addr;
2556 2557          hv += dport;
2557 2558          hv += sport;
2558 2559          hv = DOUBLE_HASH(hv, ifs);
2559 2560  
2560 2561          READ_ENTER(&ifs->ifs_ipf_state);
2561 2562          for (isp = &ifs->ifs_ips_table[hv]; ((is = *isp) != NULL); ) {
2562 2563                  isp = &is->is_hnext;
2563 2564                  /*
2564 2565                   * Only allow this icmp through if the
2565 2566                   * encapsulated packet was allowed through the
2566 2567                   * other way around. Note that the minimal amount
2567 2568                   * of info present does not allow for checking against
2568 2569                   * tcp internals such as seq and ack numbers.   Only the
2569 2570                   * ports are known to be present and can be even if the
2570 2571                   * short flag is set.
2571 2572                   */
2572 2573                  if ((is->is_p == pr) && (is->is_v == 4) &&
2573 2574                      (is = fr_matchsrcdst(&ofin, is, &src, &dst,
2574 2575                                           tcp, FI_ICMPCMP))) {
2575 2576                          /*
2576 2577                           * i  : the index of this packet (the icmp unreachable)
2577 2578                           * oi : the index of the original packet found in the
2578 2579                           *      icmp header (i.e. the packet causing this icmp)
2579 2580                           * backward : original packet was backward compared to
2580 2581                           *            the state
2581 2582                           */
2582 2583                          backward = IP6_NEQ(&is->is_src, &src);
2583 2584                          fin->fin_rev = !backward;
2584 2585                          i = (!backward << 1) + fin->fin_out;
2585 2586                          oi = (backward << 1) + ofin.fin_out;
2586 2587  
2587 2588                          if (((is->is_pass & FR_NOICMPERR) != 0) ||
2588 2589                              (is->is_icmppkts[i] > is->is_pkts[oi]))
2589 2590                                  break;
2590 2591                          ifs->ifs_ips_stats.iss_hits++;
2591 2592                          is->is_icmppkts[i]++;
2592 2593                          /*
2593 2594                           * we deliberately do not touch the timeouts
2594 2595                           * for the accompanying state table entry.
2595 2596                           * It remains to be seen if that is correct. XXX
2596 2597                           */
2597 2598                          return is;
2598 2599                  }
2599 2600          }
2600 2601          RWLOCK_EXIT(&ifs->ifs_ipf_state);
2601 2602          return NULL;
2602 2603  }
2603 2604  
2604 2605  
2605 2606  /* ------------------------------------------------------------------------ */
2606 2607  /* Function:    fr_ipsmove                                                  */
2607 2608  /* Returns:     Nil                                                         */
2608 2609  /* Parameters:  is(I) - pointer to state table entry                        */
2609 2610  /*              hv(I) - new hash value for state table entry                */
           /*              ifs(I) - IPF stack instance owning the state hash table     */
2610 2611  /* Write Locks: ipf_state                                                   */
2611 2612  /*                                                                          */
2612 2613  /* Move a state entry from one position in the hash table to another.       */
           /* hv is passed through DOUBLE_HASH() and the result is recorded in is_hv.  */
           /* The iss_inuse and iss_bucketlen statistics are adjusted to match.        */
2613 2614  /* ------------------------------------------------------------------------ */
2614 2615  static void fr_ipsmove(is, hv, ifs)
2615 2616  ipstate_t *is;
2616 2617  u_int hv;
2617 2618  ipf_stack_t *ifs;
2618 2619  {
2619 2620          ipstate_t **isp;
2620 2621          u_int hvm;
2621 2622  
                   /*
                    * NOTE(review): presumably rw_read_locked() returning 0 means the
                    * lock is held as writer, matching "Write Locks" above - confirm.
                    */
2622 2623          ASSERT(rw_read_locked(&ifs->ifs_ipf_state.ipf_lk) == 0);
2623 2624  
2624 2625          hvm = is->is_hv;
2625 2626          /*
2626 2627           * Remove the hash from the old location...
2627 2628           */
2628 2629          isp = is->is_phnext;
2629 2630          if (is->is_hnext)
2630 2631                  is->is_hnext->is_phnext = isp;
2631 2632          *isp = is->is_hnext;
                   /* If the old bucket is now empty, one fewer bucket is in use. */
2632 2633          if (ifs->ifs_ips_table[hvm] == NULL)
2633 2634                  ifs->ifs_ips_stats.iss_inuse--;
2634 2635          ifs->ifs_ips_stats.iss_bucketlen[hvm]--;
2635 2636  
2636 2637          /*
2637 2638           * ...and put the hash in the new one.
2638 2639           */
2639 2640          hvm = DOUBLE_HASH(hv, ifs);
2640 2641          is->is_hv = hvm;
2641 2642          isp = &ifs->ifs_ips_table[hvm];
                   /*
                    * The entry is inserted at the head of the chain.  If the bucket
                    * was empty before, it now counts as in use.
                    */
2642 2643          if (*isp)
2643 2644                  (*isp)->is_phnext = &is->is_hnext;
2644 2645          else
2645 2646                  ifs->ifs_ips_stats.iss_inuse++;
2646 2647          ifs->ifs_ips_stats.iss_bucketlen[hvm]++;
2647 2648          is->is_phnext = isp;
2648 2649          is->is_hnext = *isp;
2649 2650          *isp = is;
2650 2651  }
2651 2652  
2652 2653  
2653 2654  /* ------------------------------------------------------------------------ */
2654 2655  /* Function:    fr_stlookup                                                 */
2655 2656  /* Returns:     ipstate_t* - NULL == no matching state found,               */
2656 2657  /*                           else pointer to state information is returned  */
2657 2658  /* Parameters:  fin(I) - pointer to packet information                      */
2658 2659  /*              tcp(I) - pointer to TCP/UDP header.                         */
           /*              ifqp(O) - set to the timeout queue of a match; only written */
           /*                        for ICMP and ICMPv6 lookups                       */
2659 2660  /*                                                                          */
2660 2661  /* Search the state table for a matching entry to the packet described by   */
2661 2662  /* the contents of *fin.                                                    */
           /* The tcp argument is overwritten with fin->fin_dp on entry.               */
2662 2663  /*                                                                          */
2663 2664  /* If we return NULL then no lock on ipf_state is held.                     */
2664 2665  /* If we return non-null then a read-lock on ipf_state is held.             */
           /* NOTE(review): the retry paths take the lock with WRITE_ENTER and only    */
           /* call MUTEX_DOWNGRADE when the entry is rehashed; confirm that callers    */
           /* only need RWLOCK_EXIT to work on return.                                 */
2665 2666  /* ------------------------------------------------------------------------ */
2666 2667  ipstate_t *fr_stlookup(fin, tcp, ifqp)
2667 2668  fr_info_t *fin;
2668 2669  tcphdr_t *tcp;
2669 2670  ipftq_t **ifqp;
2670 2671  {
2671 2672          u_int hv, hvm, pr, v, tryagain;
2672 2673          ipstate_t *is, **isp;
2673 2674          u_short dport, sport;
2674 2675          i6addr_t src, dst;
2675 2676          struct icmp *ic;
2676 2677          ipftq_t *ifq;
2677 2678          int oow;
2678 2679          ipf_stack_t *ifs = fin->fin_ifs;
2679 2680  
2680 2681          is = NULL;
2681 2682          ifq = NULL;
2682 2683          tcp = fin->fin_dp;
2683 2684          ic = (struct icmp *)tcp;
                   /*
                    * Seed the hash with the protocol number and the in4 member of
                    * each address.
                    */
2684 2685          hv = (pr = fin->fin_fi.fi_p);
2685 2686          src = fin->fin_fi.fi_src;
2686 2687          dst = fin->fin_fi.fi_dst;
2687 2688          hv += src.in4.s_addr;
2688 2689          hv += dst.in4.s_addr;
2689 2690  
2690 2691          v = fin->fin_fi.fi_v;
2691 2692  #ifdef  USE_INET6
                   /*
                    * For IPv6 the i6[1..3] words of each address are added as well,
                    * except that the destination is left out of the hash entirely
                    * for ICMPv6 packets sent to a multicast address.
                    */
2692 2693          if (v == 6) {
2693 2694                  hv  += fin->fin_fi.fi_src.i6[1];
2694 2695                  hv  += fin->fin_fi.fi_src.i6[2];
2695 2696                  hv  += fin->fin_fi.fi_src.i6[3];
2696 2697  
2697 2698                  if ((fin->fin_p == IPPROTO_ICMPV6) &&
2698 2699                      IN6_IS_ADDR_MULTICAST(&fin->fin_fi.fi_dst.in6)) {
2699 2700                          hv -= dst.in4.s_addr;
2700 2701                  } else {
2701 2702                          hv += fin->fin_fi.fi_dst.i6[1];
2702 2703                          hv += fin->fin_fi.fi_dst.i6[2];
2703 2704                          hv += fin->fin_fi.fi_dst.i6[3];
2704 2705                  }
2705 2706          }
2706 2707  #endif
                   /*
                    * For IPv4 multicast/broadcast packets one address is dropped from
                    * the hash: the source when inbound, the destination when outbound.
                    */
2707 2708          if ((v == 4) &&
2708 2709              (fin->fin_flx & (FI_MULTICAST|FI_BROADCAST|FI_MBCAST))) {
2709 2710                  if (fin->fin_out == 0) {
2710 2711                          hv -= src.in4.s_addr;
2711 2712                  } else {
2712 2713                          hv -= dst.in4.s_addr;
2713 2714                  }
2714 2715          }
2715 2716  
2716 2717          /*
2717 2718           * Search the hash table for matching packet header info.
2718 2719           */
2719 2720          switch (pr)
2720 2721          {
2721 2722  #ifdef  USE_INET6
2722 2723          case IPPROTO_ICMPV6 :
2723 2724                  tryagain = 0;
                           /* Echo request/reply packets also hash on the ICMP id. */
2724 2725                  if (v == 6) {
2725 2726                          if ((ic->icmp_type == ICMP6_ECHO_REQUEST) ||
2726 2727                              (ic->icmp_type == ICMP6_ECHO_REPLY)) {
2727 2728                                  hv += ic->icmp_id;
2728 2729                          }
2729 2730                  }
2730 2731                  READ_ENTER(&ifs->ifs_ipf_state);
2731 2732  icmp6again:
2732 2733                  hvm = DOUBLE_HASH(hv, ifs);
2733 2734                  for (isp = &ifs->ifs_ips_table[hvm]; ((is = *isp) != NULL); ) {
2734 2735                          isp = &is->is_hnext;
2735 2736                          if ((is->is_p != pr) || (is->is_v != v))
2736 2737                                  continue;
2737 2738                          is = fr_matchsrcdst(fin, is, &src, &dst, NULL, FI_CMP);
2738 2739                          if (is != NULL &&
2739 2740                              fr_matchicmpqueryreply(v, &is->is_icmp,
2740 2741                                                     ic, fin->fin_rev)) {
2741 2742                                  if (fin->fin_rev)
2742 2743                                          ifq = &ifs->ifs_ips_icmpacktq;
2743 2744                                  else
2744 2745                                          ifq = &ifs->ifs_ips_icmptq;
2745 2746                                  break;
2746 2747                          }
2747 2748                  }
2748 2749  
2749 2750                  if (is != NULL) {
                                   /*
                                    * A match found on the retry pass (hash without the
                                    * source address) that is not flagged SI_W_DADDR is
                                    * moved to the bucket for the hash with the source
                                    * address re-added, then the lock is downgraded.
                                    */
2750 2751                          if ((tryagain != 0) && !(is->is_flags & SI_W_DADDR)) {
2751 2752                                  hv += fin->fin_fi.fi_src.i6[0];
2752 2753                                  hv += fin->fin_fi.fi_src.i6[1];
2753 2754                                  hv += fin->fin_fi.fi_src.i6[2];
2754 2755                                  hv += fin->fin_fi.fi_src.i6[3];
2755 2756                                  fr_ipsmove(is, hv, ifs);
2756 2757                                  MUTEX_DOWNGRADE(&ifs->ifs_ipf_state);
2757 2758                          }
2758 2759                          break;
2759 2760                  }
2760 2761                  RWLOCK_EXIT(&ifs->ifs_ipf_state);
2761 2762  
2762 2763                  /*
2763 2764                   * No matching icmp state entry. Perhaps this is a
2764 2765                   * response to another state entry.
2765 2766                   *
2766 2767                   * XXX With some ICMP6 packets, the "other" address is already
2767 2768                   * in the packet, after the ICMP6 header, and this could be
2768 2769                   * used in place of the multicast address.  However, taking
2769 2770                   * advantage of this requires some significant code changes
2770 2771                   * to handle the specific types where that is the case.
2771 2772                   */
2772 2773                  if ((ifs->ifs_ips_stats.iss_wild != 0) && (v == 6) && (tryagain == 0) &&
2773 2774                      !IN6_IS_ADDR_MULTICAST(&fin->fin_fi.fi_src.in6)) {
                                   /*
                                    * Retry once with the source address removed from the
                                    * hash.  The lock is taken with WRITE_ENTER because a
                                    * match on this pass may be moved by fr_ipsmove().
                                    */
2774 2775                          hv -= fin->fin_fi.fi_src.i6[0];
2775 2776                          hv -= fin->fin_fi.fi_src.i6[1];
2776 2777                          hv -= fin->fin_fi.fi_src.i6[2];
2777 2778                          hv -= fin->fin_fi.fi_src.i6[3];
2778 2779                          tryagain = 1;
2779 2780                          WRITE_ENTER(&ifs->ifs_ipf_state);
2780 2781                          goto icmp6again;
2781 2782                  }
2782 2783  
                           /* This return path bypasses the *ifqp update at the end. */
2783 2784                  is = fr_checkicmp6matchingstate(fin);
2784 2785                  if (is != NULL)
2785 2786                          return is;
2786 2787                  break;
2787 2788  #endif
2788 2789  
2789 2790          case IPPROTO_ICMP :
                           /* IPv4 ICMP lookups also hash on the ICMP id. */
2790 2791                  if (v == 4) {
2791 2792                          hv += ic->icmp_id;
2792 2793                  }
2793 2794                  hv = DOUBLE_HASH(hv, ifs);
2794 2795                  READ_ENTER(&ifs->ifs_ipf_state);
2795 2796                  for (isp = &ifs->ifs_ips_table[hv]; ((is = *isp) != NULL); ) {
2796 2797                          isp = &is->is_hnext;
2797 2798                          if ((is->is_p != pr) || (is->is_v != v))
2798 2799                                  continue;
2799 2800                          is = fr_matchsrcdst(fin, is, &src, &dst, NULL, FI_CMP);
2800 2801                          if (is != NULL &&
2801 2802                              fr_matchicmpqueryreply(v, &is->is_icmp,
2802 2803                                                     ic, fin->fin_rev)) {
2803 2804                                  if (fin->fin_rev)
2804 2805                                          ifq = &ifs->ifs_ips_icmpacktq;
2805 2806                                  else
2806 2807                                          ifq = &ifs->ifs_ips_icmptq;
2807 2808                                  break;
2808 2809                          }
2809 2810                  }
2810 2811                  if (is == NULL) {
2811 2812                          RWLOCK_EXIT(&ifs->ifs_ipf_state);
2812 2813                  }
2813 2814                  break;
2814 2815  
2815 2816          case IPPROTO_TCP :
2816 2817          case IPPROTO_UDP :
                           /* No timeout queue is reported to the caller for TCP/UDP. */
2817 2818                  ifqp = NULL;
2818 2819                  sport = htons(fin->fin_data[0]);
2819 2820                  hv += sport;
2820 2821                  dport = htons(fin->fin_data[1]);
2821 2822                  hv += dport;
2822 2823                  oow = 0;
2823 2824                  tryagain = 0;
2824 2825                  READ_ENTER(&ifs->ifs_ipf_state);
2825 2826  retry_tcpudp:
2826 2827                  hvm = DOUBLE_HASH(hv, ifs);
2827 2828                  for (isp = &ifs->ifs_ips_table[hvm]; ((is = *isp) != NULL); ) {
2828 2829                          isp = &is->is_hnext;
2829 2830                          if ((is->is_p != pr) || (is->is_v != v))
2830 2831                                  continue;
                                   /*
                                    * FI_OOW is cleared before each candidate; if
                                    * fr_tcpstate() rejects the candidate, whatever FI_OOW
                                    * value is then in fin_flx is remembered in oow.
                                    */
2831 2832                          fin->fin_flx &= ~FI_OOW;
2832 2833                          is = fr_matchsrcdst(fin, is, &src, &dst, tcp, FI_CMP);
2833 2834                          if (is != NULL) {
2834 2835                                  if (pr == IPPROTO_TCP) {
2835 2836                                          if (!fr_tcpstate(fin, tcp, is)) {
2836 2837                                                  oow |= fin->fin_flx & FI_OOW;
2837 2838                                                  continue;
2838 2839                                          }
2839 2840                                  }
2840 2841                                  break;
2841 2842                          }
2842 2843                  }
2843 2844                  if (is != NULL) {
                                   /*
                                    * A match found on a retry pass whose entry has none of
                                    * SI_CLONE, SI_WILDP or SI_WILDA set is moved to the
                                    * bucket for the hash with both ports added, then the
                                    * lock is downgraded.
                                    */
2844 2845                          if (tryagain &&
2845 2846                              !(is->is_flags & (SI_CLONE|SI_WILDP|SI_WILDA))) {
2846 2847                                  hv += dport;
2847 2848                                  hv += sport;
2848 2849                                  fr_ipsmove(is, hv, ifs);
2849 2850                                  MUTEX_DOWNGRADE(&ifs->ifs_ipf_state);
2850 2851                          }
2851 2852                          break;
2852 2853                  }
2853 2854                  RWLOCK_EXIT(&ifs->ifs_ipf_state);
2854 2855  
                           /*
                            * Retries only happen when iss_wild is non-zero (presumably a
                            * count of wildcard state entries - confirm).  The first retry
                            * hashes without the ports; the second rebuilds the hash from
                            * the protocol, one address and both ports.  Retries hold the
                            * lock via WRITE_ENTER.
                            */
2855 2856                  if (ifs->ifs_ips_stats.iss_wild) {
2856 2857                          if (tryagain == 0) {
2857 2858                                  hv -= dport;
2858 2859                                  hv -= sport;
2859 2860                          } else if (tryagain == 1) {
2860 2861                                  hv = fin->fin_fi.fi_p;
2861 2862                                  /*
2862 2863                                   * If we try to pretend this is a reply to a
2863 2864                                   * multicast/broadcast packet then we need to
2864 2865                                   * exclude part of the address from the hash
2865 2866                                   * calculation.
2866 2867                                   */
2867 2868                                  if (fin->fin_out == 0) {
2868 2869                                          hv += src.in4.s_addr;
2869 2870                                  } else {
2870 2871                                          hv += dst.in4.s_addr;
2871 2872                                  }
2872 2873                                  hv += dport;
2873 2874                                  hv += sport;
2874 2875                          }
2875 2876                          tryagain++;
2876 2877                          if (tryagain <= 2) {
2877 2878                                  WRITE_ENTER(&ifs->ifs_ipf_state);
2878 2879                                  goto retry_tcpudp;
2879 2880                          }
2880 2881                  }
                           /*
                            * Nothing matched: put back any FI_OOW flag recorded from
                            * candidates that fr_tcpstate() rejected.
                            */
2881 2882                  fin->fin_flx |= oow;
2882 2883                  break;
2883 2884  
2884 2885  #if 0
2885 2886          case IPPROTO_GRE :
2886 2887                  gre = fin->fin_dp;
2887 2888                  if (GRE_REV(gre->gr_flags) == 1) {
2888 2889                          hv += gre->gr_call;
2889 2890                  }
2890 2891                  /* FALLTHROUGH */
2891 2892  #endif
2892 2893          default :
                           /* No timeout queue is reported to the caller here either. */
2893 2894                  ifqp = NULL;
2894 2895                  hvm = DOUBLE_HASH(hv, ifs);
2895 2896                  READ_ENTER(&ifs->ifs_ipf_state);
2896 2897                  for (isp = &ifs->ifs_ips_table[hvm]; ((is = *isp) != NULL); ) {
2897 2898                          isp = &is->is_hnext;
2898 2899                          if ((is->is_p != pr) || (is->is_v != v))
2899 2900                                  continue;
2900 2901                          is = fr_matchsrcdst(fin, is, &src, &dst, NULL, FI_CMP);
2901 2902                          if (is != NULL) {
2902 2903                                  ifq = &ifs->ifs_ips_iptq;
2903 2904                                  break;
2904 2905                          }
2905 2906                  }
2906 2907                  if (is == NULL) {
2907 2908                          RWLOCK_EXIT(&ifs->ifs_ipf_state);
2908 2909                  }
2909 2910                  break;
2910 2911          }
2911 2912  
                   /*
                    * A rule-based timeout queue (TQE_RULEBASED) for this direction
                    * replaces the protocol default chosen above.
                    */
2912 2913          if ((is != NULL) && ((is->is_sti.tqe_flags & TQE_RULEBASED) != 0) &&
2913 2914              (is->is_tqehead[fin->fin_rev] != NULL))
2914 2915                  ifq = is->is_tqehead[fin->fin_rev];
2915 2916          if (ifq != NULL && ifqp != NULL)
2916 2917                  *ifqp = ifq;
2917 2918          return is;
2918 2919  }
2919 2920  
2920 2921  
2921 2922  /* ------------------------------------------------------------------------ */
2922 2923  /* Function:    fr_updatestate                                              */
2923 2924  /* Returns:     Nil                                                         */
2924 2925  /* Parameters:  fin(I) - pointer to packet information                      */
2925 2926  /*              is(I)  - pointer to state table entry                       */
           /*              ifq(I) - timeout queue to move the entry to, or NULL        */
2926 2927  /* Read Locks:  ipf_state                                                   */
2927 2928  /*                                                                          */
2928 2929  /* Updates packet and byte counters for a newly received packet.  Seeds the */
2929 2930  /* fragment cache with a new entry as required.                             */
           /* Also sets fin_pktnum and fin_fr from the state entry.                    */
2930 2931  /* ------------------------------------------------------------------------ */
2931 2932  void fr_updatestate(fin, is, ifq)
2932 2933  fr_info_t *fin;
2933 2934  ipstate_t *is;
2934 2935  ipftq_t *ifq;
2935 2936  {
2936 2937          ipftqent_t *tqe;
2937 2938          int i, pass;
2938 2939          ipf_stack_t *ifs = fin->fin_ifs;
2939 2940  
                   /* Counter slot: fin_rev selects the pair, fin_out the member. */
2940 2941          i = (fin->fin_rev << 1) + fin->fin_out;
2941 2942  
2942 2943          /*
2943 2944           * For TCP packets, ifq == NULL.  For all others, check if this new
2944 2945           * queue is different to the last one it was on and move it if so.
2945 2946           */
2946 2947          tqe = &is->is_sti;
2947 2948          MUTEX_ENTER(&is->is_lock);
                   /* Rule-based entries use their own per-direction queue instead. */
2948 2949          if ((tqe->tqe_flags & TQE_RULEBASED) != 0)
2949 2950                  ifq = is->is_tqehead[fin->fin_rev];
2950 2951  
2951 2952          if (ifq != NULL)
2952 2953                  fr_movequeue(tqe, tqe->tqe_ifq, ifq, ifs);
2953 2954  
2954 2955          is->is_pkts[i]++;
                   /* fin_pktnum counts both normal and ICMP packets for this slot. */
2955 2956          fin->fin_pktnum = is->is_pkts[i] + is->is_icmppkts[i];
2956 2957          is->is_bytes[i] += fin->fin_plen;
2957 2958          MUTEX_EXIT(&is->is_lock);
2958 2959  
2959 2960  #ifdef  IPFILTER_SYNC
2960 2961          if (is->is_flags & IS_STATESYNC)
2961 2962                  ipfsync_update(SMC_STATE, fin, is->is_sync);
2962 2963  #endif
2963 2964  
2964 2965          ATOMIC_INCL(ifs->ifs_ips_stats.iss_hits);
2965 2966  
2966 2967          fin->fin_fr = is->is_rule;
2967 2968  
2968 2969          /*
2969 2970           * If this packet is a fragment and the rule says to track fragments,
2970 2971           * then create a new fragment cache entry.
2971 2972           */
2972 2973          pass = is->is_pass;
                   /*
                    * NOTE(review): the XOR toggles FR_KEEPSTATE in the flags handed
                    * to fr_newfrag(); presumably the bit is always set in is_pass so
                    * this clears it - confirm.
                    */
2973 2974          if ((fin->fin_flx & FI_FRAG) && FR_ISPASS(pass))
2974 2975                  (void) fr_newfrag(fin, pass ^ FR_KEEPSTATE);
2975 2976  }
2976 2977  
2977 2978  
2978 2979  /* ------------------------------------------------------------------------ */
2979 2980  /* Function:    fr_checkstate                                               */
2980 2981  /* Returns:     frentry_t* - NULL == search failed,                         */
2981 2982  /*                           else pointer to rule for matching state        */
2982 2983  /* Parameters:  fin(I)   - pointer to packet information                    */
2983 2984  /*              passp(O) - pointer to filtering result flags                */
2984 2985  /*                                                                          */
2985 2986  /* Check if a packet is associated with an entry in the state table.        */
           /* On a match the state counters are updated, FI_STATE is set in fin_flx    */
           /* and *passp receives the state's pass flags.  A match whose is_rule is    */
           /* NULL still does all of that but returns NULL.  ipf_state is not held on  */
           /* return.                                                                  */
2986 2987  /* ------------------------------------------------------------------------ */
2987 2988  frentry_t *fr_checkstate(fin, passp)
2988 2989  fr_info_t *fin;
2989 2990  u_32_t *passp;
2990 2991  {
2991 2992          ipstate_t *is;
2992 2993          frentry_t *fr;
2993 2994          tcphdr_t *tcp;
2994 2995          ipftq_t *ifq;
2995 2996          u_int pass;
2996 2997          ipf_stack_t *ifs = fin->fin_ifs;
2997 2998  
                   /*
                    * No lookup is done while ifs_fr_state_lock is set, while
                    * ifs_ips_list is NULL, or for packets flagged FI_SHORT, FI_STATE,
                    * FI_FRAGBODY or FI_BAD.
                    */
2998 2999          if (ifs->ifs_fr_state_lock || (ifs->ifs_ips_list == NULL) ||
2999 3000              (fin->fin_flx & (FI_SHORT|FI_STATE|FI_FRAGBODY|FI_BAD)))
3000 3001                  return NULL;
3001 3002  
3002 3003          is = NULL;
3003 3004          if ((fin->fin_flx & FI_TCPUDP) ||
3004 3005              (fin->fin_fi.fi_p == IPPROTO_ICMP)
3005 3006  #ifdef  USE_INET6
3006 3007              || (fin->fin_fi.fi_p == IPPROTO_ICMPV6)
3007 3008  #endif
3008 3009              )
3009 3010                  tcp = fin->fin_dp;
3010 3011          else
3011 3012                  tcp = NULL;
3012 3013  
3013 3014          /*
3014 3015           * Search the hash table for matching packet header info.
3015 3016           */
3016 3017          ifq = NULL;
                   /*
                    * A non-NULL result comes back with ipf_state held; every exit
                    * path below releases it with RWLOCK_EXIT.
                    */
3017 3018          is = fr_stlookup(fin, tcp, &ifq);
3018 3019          switch (fin->fin_p)
3019 3020          {
3020 3021  #ifdef  USE_INET6
3021 3022          case IPPROTO_ICMPV6 :
3022 3023                  if (is != NULL)
3023 3024                          break;
3024 3025                  if (fin->fin_v == 6) {
3025 3026                          is = fr_checkicmp6matchingstate(fin);
3026 3027                          if (is != NULL)
3027 3028                                  goto matched;
3028 3029                  }
3029 3030                  break;
3030 3031  #endif
3031 3032          case IPPROTO_ICMP :
3032 3033                  if (is != NULL)
3033 3034                          break;
3034 3035                  /*
3035 3036                   * No matching icmp state entry. Perhaps this is a
3036 3037                   * response to another state entry.
3037 3038                   */
3038 3039                  is = fr_checkicmpmatchingstate(fin);
3039 3040                  if (is != NULL)
3040 3041                          goto matched;
3041 3042                  break;
3042 3043          case IPPROTO_TCP :
3043 3044                  if (is == NULL)
3044 3045                          break;
3045 3046  
                           /* FR_NEWISN states get the packet's seq/ack rewritten. */
3046 3047                  if (is->is_pass & FR_NEWISN) {
3047 3048                          if (fin->fin_out == 0)
3048 3049                                  fr_fixinisn(fin, is);
3049 3050                          else if (fin->fin_out == 1)
3050 3051                                  fr_fixoutisn(fin, is);
3051 3052                  }
3052 3053                  break;
3053 3054          default :
                           /*
                            * Every other protocol, UDP included, is placed on one of the
                            * UDP timeout queues according to direction.
                            */
3054 3055                  if (fin->fin_rev)
3055 3056                          ifq = &ifs->ifs_ips_udpacktq;
3056 3057                  else
3057 3058                          ifq = &ifs->ifs_ips_udptq;
3058 3059                  break;
3059 3060          }
3060 3061          if (is == NULL) {
3061 3062                  ATOMIC_INCL(ifs->ifs_ips_stats.iss_miss);
3062 3063                  return NULL;
3063 3064          }
3064 3065  
3065 3066  matched:
3066 3067          fr = is->is_rule;
3067 3068          if (fr != NULL) {
                           /*
                            * If the rule has a NAT tag (ipt_num[0] != 0), an inbound
                            * packet must have fin_nattag set and fr_matchtag() must
                            * return 0; otherwise the lookup is reported as failed.
                            */
3068 3069                  if ((fin->fin_out == 0) && (fr->fr_nattag.ipt_num[0] != 0)) {
3069 3070                          if (fin->fin_nattag == NULL) {
3070 3071                                  RWLOCK_EXIT(&ifs->ifs_ipf_state);
3071 3072                                  return NULL;
3072 3073                          }
3073 3074                          if (fr_matchtag(&fr->fr_nattag, fin->fin_nattag) != 0) {
3074 3075                                  RWLOCK_EXIT(&ifs->ifs_ipf_state);
3075 3076                                  return NULL;
3076 3077                          }
3077 3078                  }
3078 3079                  (void) strncpy(fin->fin_group, fr->fr_group, FR_GROUPLEN);
3079 3080                  fin->fin_icode = fr->fr_icode;
3080 3081          }
3081 3082  
3082 3083          fin->fin_rule = is->is_rulen;
                   /* is_pass is copied before the lock is dropped below. */
3083 3084          pass = is->is_pass;
3084 3085          fr_updatestate(fin, is, ifq);
3085 3086  
3086 3087          RWLOCK_EXIT(&ifs->ifs_ipf_state);
3087 3088          fin->fin_flx |= FI_STATE;
                   /*
                    * With FR_LOGFIRST set, both FR_LOGFIRST and FR_LOG are removed
                    * from the flags handed back (presumably because a packet that
                    * matches existing state is not the first one - confirm).
                    */
3088 3089          if ((pass & FR_LOGFIRST) != 0)
3089 3090                  pass &= ~(FR_LOGFIRST|FR_LOG);
3090 3091          *passp = pass;
3091 3092          return fr;
3092 3093  }
3093 3094  
3094 3095  
3095 3096  /* ------------------------------------------------------------------------ */
3096 3097  /* Function:    fr_fixoutisn                                                */
3097 3098  /* Returns:     Nil                                                         */
3098 3099  /* Parameters:  fin(I)   - pointer to packet information                    */
3099 3100  /*              is(I)  - pointer to master state structure                  */
3100 3101  /*                                                                          */
3101 3102  /* Called only for outbound packets, adjusts the sequence number and the    */
3102 3103  /* TCP checksum to match that change.                                       */
3103 3104  /* ------------------------------------------------------------------------ */
3104 3105  static void fr_fixoutisn(fin, is)
3105 3106  fr_info_t *fin;
3106 3107  ipstate_t *is;
3107 3108  {
3108 3109          tcphdr_t *tcp;
3109 3110          int rev;
3110 3111          u_32_t seq;
3111 3112  
3112 3113          tcp = fin->fin_dp;
3113 3114          rev = fin->fin_rev;
                   /*
                    * IS_ISNSYN: packets with fin_rev == 0 have is_isninc[0] added to
                    * th_seq, and th_sum is adjusted using is_sumd[0].
                    */
3114 3115          if ((is->is_flags & IS_ISNSYN) != 0) {
3115 3116                  if (rev == 0) {
3116 3117                          seq = ntohl(tcp->th_seq);
3117 3118                          seq += is->is_isninc[0];
3118 3119                          tcp->th_seq = htonl(seq);
3119 3120                          fix_outcksum(&tcp->th_sum, is->is_sumd[0]);
3120 3121                  }
3121 3122          }
                   /*
                    * IS_ISNACK: packets with fin_rev == 1 have is_isninc[1] added to
                    * th_seq, and th_sum is adjusted using is_sumd[1].
                    */
3122 3123          if ((is->is_flags & IS_ISNACK) != 0) {
3123 3124                  if (rev == 1) {
3124 3125                          seq = ntohl(tcp->th_seq);
3125 3126                          seq += is->is_isninc[1];
3126 3127                          tcp->th_seq = htonl(seq);
3127 3128                          fix_outcksum(&tcp->th_sum, is->is_sumd[1]);
3128 3129                  }
3129 3130          }
3130 3131  }
3131 3132  
3132 3133  
3133 3134  /* ------------------------------------------------------------------------ */
3134 3135  /* Function:    fr_fixinisn                                                 */
3135 3136  /* Returns:     Nil                                                         */
3136 3137  /* Parameters:  fin(I)   - pointer to packet information                    */
3137 3138  /*              is(I)  - pointer to master state structure                  */
3138 3139  /*                                                                          */
3139 3140  /* Called only for inbound packets, adjusts the acknowledge number and the  */
3140 3141  /* TCP checksum to match that change.                                       */
3141 3142  /* ------------------------------------------------------------------------ */
3142 3143  static void fr_fixinisn(fin, is)
3143 3144  fr_info_t *fin;
3144 3145  ipstate_t *is;
3145 3146  {
3146 3147          tcphdr_t *tcp;
3147 3148          int rev;
3148 3149          u_32_t ack;
3149 3150  
3150 3151          tcp = fin->fin_dp;
3151 3152          rev = fin->fin_rev;
3152 3153          if ((is->is_flags & IS_ISNSYN) != 0) {
3153 3154                  if (rev == 1) {
3154 3155                          ack = ntohl(tcp->th_ack);
3155 3156                          ack -= is->is_isninc[0];
3156 3157                          tcp->th_ack = htonl(ack);
3157 3158                          fix_incksum(&tcp->th_sum, is->is_sumd[0]);
3158 3159                  }
3159 3160          }
3160 3161          if ((is->is_flags & IS_ISNACK) != 0) {
3161 3162                  if (rev == 0) {
3162 3163                          ack = ntohl(tcp->th_ack);
3163 3164                          ack -= is->is_isninc[1];
3164 3165                          tcp->th_ack = htonl(ack);
3165 3166                          fix_incksum(&tcp->th_sum, is->is_sumd[1]);
3166 3167                  }
3167 3168          }
3168 3169  }
3169 3170  
3170 3171  
3171 3172  /* ------------------------------------------------------------------------ */
3172 3173  /* Function:    fr_statesync                                                */
3173 3174  /* Returns:     Nil                                                         */
3174 3175  /* Parameters:  action(I) - type of synchronisation to do                   */
3175 3176  /*              v(I)      - IP version being sync'd (v4 or v6)              */
3176 3177  /*              ifp(I)    - interface identifier associated with action     */
3177 3178  /*              name(I)   - name associated with ifp parameter              */
3178 3179  /*                                                                          */
3179 3180  /* Walk through all state entries and if an interface pointer match is      */
3180 3181  /* found then look it up again, based on its name in case the pointer has   */
3181 3182  /* changed since last time.                                                 */
3182 3183  /*                                                                          */
3183 3184  /* If ifp is passed in as being non-null then we are only doing updates for */
3184 3185  /* existing, matching, uses of it.                                          */
3185 3186  /* ------------------------------------------------------------------------ */
3186 3187  void fr_statesync(action, v, ifp, name, ifs)
3187 3188  int action, v;
3188 3189  void *ifp;
3189 3190  char *name;
3190 3191  ipf_stack_t *ifs;
3191 3192  {
3192 3193          ipstate_t *is;
3193 3194          int i;
3194 3195  
3195 3196          if (ifs->ifs_fr_running <= 0)
3196 3197                  return;
3197 3198  
3198 3199          WRITE_ENTER(&ifs->ifs_ipf_state);
3199 3200  
3200 3201          if (ifs->ifs_fr_running <= 0) {
3201 3202                  RWLOCK_EXIT(&ifs->ifs_ipf_state);
3202 3203                  return;
3203 3204          }
3204 3205  
3205 3206          switch (action)
3206 3207          {
3207 3208          case IPFSYNC_RESYNC :
3208 3209                  for (is = ifs->ifs_ips_list; is; is = is->is_next) {
3209 3210                          if (v != 0 && is->is_v != v)
3210 3211                                  continue;
3211 3212                          /*
3212 3213                           * Look up all the interface names in the state entry.
3213 3214                           */
3214 3215                          for (i = 0; i < 4; i++) {
3215 3216                                  is->is_ifp[i] = fr_resolvenic(is->is_ifname[i],
3216 3217                                                                is->is_v, ifs);
3217 3218                          }
3218 3219                  }
3219 3220                  break;
3220 3221          case IPFSYNC_NEWIFP :
3221 3222                  for (is = ifs->ifs_ips_list; is; is = is->is_next) {
3222 3223                          if (v != 0 && is->is_v != v)
3223 3224                                  continue;
3224 3225                          /*
3225 3226                           * Look up all the interface names in the state entry.
3226 3227                           */
3227 3228                          for (i = 0; i < 4; i++) {
3228 3229                                  if (!strncmp(is->is_ifname[i], name,
3229 3230                                               sizeof(is->is_ifname[i])))
3230 3231                                          is->is_ifp[i] = ifp;
3231 3232                          }
3232 3233                  }
3233 3234                  break;
3234 3235          case IPFSYNC_OLDIFP :
3235 3236                  for (is = ifs->ifs_ips_list; is; is = is->is_next) {
3236 3237                          if (v != 0 && is->is_v != v)
3237 3238                                  continue;
3238 3239                          /*
3239 3240                           * Look up all the interface names in the state entry.
3240 3241                           */
3241 3242                          for (i = 0; i < 4; i++) {
3242 3243                                  if (is->is_ifp[i] == ifp)
3243 3244                                          is->is_ifp[i] = (void *)-1;
3244 3245                          }
3245 3246                  }
3246 3247                  break;
3247 3248          }
3248 3249          RWLOCK_EXIT(&ifs->ifs_ipf_state);
3249 3250  }
3250 3251  
3251 3252  
3252 3253  #if SOLARIS2 >= 10
3253 3254  /* ------------------------------------------------------------------------ */
3254 3255  /* Function:    fr_stateifindexsync                                         */
3255 3256  /* Returns:     void                                                        */
3256 3257  /* Parameters:  ifp     - current network interface descriptor (ifindex)    */
3257 3258  /*              newifp  - new interface descriptor (new ifindex)            */
3258 3259  /*              ifs     - pointer to IPF stack                              */
3259 3260  /*                                                                          */
3260 3261  /* Write Locks: assumes ipf_mutex is locked                                 */
3261 3262  /*                                                                          */
3262 3263  /* Updates all interface indeces matching to ifp with new interface index   */
3263 3264  /* value.                                                                   */
3264 3265  /* ------------------------------------------------------------------------ */
3265 3266  void fr_stateifindexsync(ifp, newifp, ifs)
3266 3267  void *ifp;
3267 3268  void *newifp;
3268 3269  ipf_stack_t *ifs;
3269 3270  {
3270 3271          ipstate_t *is;
3271 3272          int i;
3272 3273  
3273 3274          WRITE_ENTER(&ifs->ifs_ipf_state);
3274 3275  
3275 3276          for (is = ifs->ifs_ips_list; is != NULL; is = is->is_next) {
3276 3277  
3277 3278                  for (i = 0; i < 4; i++) {
3278 3279                          if (is->is_ifp[i] == ifp)
3279 3280                                  is->is_ifp[i] = newifp;
3280 3281                  }
3281 3282          }
3282 3283  
3283 3284          RWLOCK_EXIT(&ifs->ifs_ipf_state);
3284 3285  }
3285 3286  #endif
3286 3287  
3287 3288  /* ------------------------------------------------------------------------ */
3288 3289  /* Function:    fr_delstate                                                 */
3289 3290  /* Returns:     int - 0 = entry deleted, else ref count on entry            */
3290 3291  /* Parameters:  is(I)  - pointer to state structure to delete               */
3291 3292  /*              why(I) - if not 0, log reason why it was deleted            */
3292 3293  /*              ifs    - ipf stack instance                                 */
3293 3294  /* Write Locks: ipf_state/ipf_global                                        */
3294 3295  /*                                                                          */
3295 3296  /* Deletes a state entry from the enumerated list as well as the hash table */
3296 3297  /* and timeout queue lists.  Make adjustments to hash table statistics and  */
3297 3298  /* global counters as required.                                             */
3298 3299  /* ------------------------------------------------------------------------ */
3299 3300  int fr_delstate(is, why, ifs)
3300 3301  ipstate_t *is;
3301 3302  int why;
3302 3303  ipf_stack_t *ifs;
3303 3304  {
3304 3305          int removed = 0;
3305 3306  
3306 3307          ASSERT(rw_write_held(&ifs->ifs_ipf_global.ipf_lk) == 0 ||
3307 3308                  rw_write_held(&ifs->ifs_ipf_state.ipf_lk) == 0);
3308 3309  
3309 3310          /*
3310 3311           * Start by removing the entry from the hash table of state entries
3311 3312           * so it will not be "used" again.
3312 3313           *
3313 3314           * It will remain in the "list" of state entries until all references
3314 3315           * have been accounted for.
3315 3316           */
3316 3317          if (is->is_phnext != NULL) {
3317 3318                  removed = 1;
3318 3319                  *is->is_phnext = is->is_hnext;
3319 3320                  if (is->is_hnext != NULL)
3320 3321                          is->is_hnext->is_phnext = is->is_phnext;
3321 3322                  if (ifs->ifs_ips_table[is->is_hv] == NULL)
3322 3323                          ifs->ifs_ips_stats.iss_inuse--;
3323 3324                  ifs->ifs_ips_stats.iss_bucketlen[is->is_hv]--;
3324 3325  
3325 3326                  is->is_phnext = NULL;
3326 3327                  is->is_hnext = NULL;
3327 3328          }
3328 3329  
3329 3330          /*
3330 3331           * Because ifs->ifs_ips_stats.iss_wild is a count of entries in the state
3331 3332           * table that have wildcard flags set, only decerement it once
3332 3333           * and do it here.
3333 3334           */
3334 3335          if (is->is_flags & (SI_WILDP|SI_WILDA)) {
3335 3336                  if (!(is->is_flags & SI_CLONED)) {
3336 3337                          ATOMIC_DECL(ifs->ifs_ips_stats.iss_wild);
3337 3338                  }
3338 3339                  is->is_flags &= ~(SI_WILDP|SI_WILDA);
3339 3340          }
3340 3341  
3341 3342          /*
3342 3343           * Next, remove it from the timeout queue it is in.
3343 3344           */
3344 3345          fr_deletequeueentry(&is->is_sti);
3345 3346  
3346 3347          is->is_me = NULL;
3347 3348  
3348 3349          /*
3349 3350           * If it is still in use by something else, do not go any further,
3350 3351           * but note that at this point it is now an orphan.
3351 3352           */
3352 3353          MUTEX_ENTER(&is->is_lock);
3353 3354          if (is->is_ref > 1) {
3354 3355                  is->is_ref--;
3355 3356                  MUTEX_EXIT(&is->is_lock);
3356 3357                  if (removed)
3357 3358                          ifs->ifs_ips_stats.iss_orphans++;
3358 3359                  return (is->is_ref);
3359 3360          }
3360 3361          MUTEX_EXIT(&is->is_lock);
3361 3362  
3362 3363          is->is_ref = 0;
3363 3364  
3364 3365          /*
3365 3366           * If entry has already been removed from table,
3366 3367           * it means we're simply cleaning up an orphan.
3367 3368           */
3368 3369          if (!removed)
3369 3370                  ifs->ifs_ips_stats.iss_orphans--;
3370 3371  
3371 3372          if (is->is_tqehead[0] != NULL)
3372 3373                  (void) fr_deletetimeoutqueue(is->is_tqehead[0]);
3373 3374  
3374 3375          if (is->is_tqehead[1] != NULL)
3375 3376                  (void) fr_deletetimeoutqueue(is->is_tqehead[1]);
3376 3377  
3377 3378  #ifdef  IPFILTER_SYNC
3378 3379          if (is->is_sync)
3379 3380                  ipfsync_del(is->is_sync);
3380 3381  #endif
3381 3382  #ifdef  IPFILTER_SCAN
3382 3383          (void) ipsc_detachis(is);
3383 3384  #endif
3384 3385  
3385 3386          /*
3386 3387           * Now remove it from master list of state table entries.
3387 3388           */
3388 3389          if (is->is_pnext != NULL) {
3389 3390                  *is->is_pnext = is->is_next;
3390 3391                  if (is->is_next != NULL) {
3391 3392                          is->is_next->is_pnext = is->is_pnext;
3392 3393                          is->is_next = NULL;
3393 3394                  }
3394 3395                  is->is_pnext = NULL;
3395 3396          }
3396 3397   
3397 3398          if (ifs->ifs_ipstate_logging != 0 && why != 0)
3398 3399                  ipstate_log(is, why, ifs);
3399 3400  
3400 3401          if (is->is_rule != NULL) {
3401 3402                  is->is_rule->fr_statecnt--;
3402 3403                  (void)fr_derefrule(&is->is_rule, ifs);
3403 3404          }
3404 3405  
3405 3406          MUTEX_DESTROY(&is->is_lock);
3406 3407          KFREE(is);
3407 3408          ifs->ifs_ips_num--;
3408 3409  
3409 3410          return (0);
3410 3411  }
3411 3412  
3412 3413  
3413 3414  /* ------------------------------------------------------------------------ */
3414 3415  /* Function:    fr_timeoutstate                                             */
3415 3416  /* Returns:     Nil                                                         */
3416 3417  /* Parameters:  ifs - ipf stack instance                                    */
3417 3418  /*                                                                          */
3418 3419  /* Slowly expire held state for thingslike UDP and ICMP.  The algorithm     */
3419 3420  /* used here is to keep the queue sorted with the oldest things at the top  */
3420 3421  /* and the youngest at the bottom.  So if the top one doesn't need to be    */
3421 3422  /* expired then neither will any under it.                                  */
3422 3423  /* ------------------------------------------------------------------------ */
3423 3424  void fr_timeoutstate(ifs)
3424 3425  ipf_stack_t *ifs;
3425 3426  {
3426 3427          ipftq_t *ifq, *ifqnext;
3427 3428          ipftqent_t *tqe, *tqn;
3428 3429          ipstate_t *is;
3429 3430          SPL_INT(s);
3430 3431  
3431 3432          SPL_NET(s);
3432 3433          WRITE_ENTER(&ifs->ifs_ipf_state);
3433 3434          for (ifq = ifs->ifs_ips_tqtqb; ifq != NULL; ifq = ifq->ifq_next)
3434 3435                  for (tqn = ifq->ifq_head; ((tqe = tqn) != NULL); ) {
3435 3436                          if (tqe->tqe_die > ifs->ifs_fr_ticks)
3436 3437                                  break;
3437 3438                          tqn = tqe->tqe_next;
3438 3439                          is = tqe->tqe_parent;
3439 3440                          (void) fr_delstate(is, ISL_EXPIRE, ifs);
3440 3441                  }
3441 3442  
3442 3443          for (ifq = ifs->ifs_ips_utqe; ifq != NULL; ifq = ifq->ifq_next) {
3443 3444                  for (tqn = ifq->ifq_head; ((tqe = tqn) != NULL); ) {
3444 3445                          if (tqe->tqe_die > ifs->ifs_fr_ticks)
3445 3446                                  break;
3446 3447                          tqn = tqe->tqe_next;
3447 3448                          is = tqe->tqe_parent;
3448 3449                          (void) fr_delstate(is, ISL_EXPIRE, ifs);
3449 3450                  }
3450 3451          }
3451 3452  
3452 3453          for (ifq = ifs->ifs_ips_utqe; ifq != NULL; ifq = ifqnext) {
3453 3454                  ifqnext = ifq->ifq_next;
3454 3455  
3455 3456                  if (((ifq->ifq_flags & IFQF_DELETE) != 0) &&
3456 3457                      (ifq->ifq_ref == 0)) {
3457 3458                          fr_freetimeoutqueue(ifq, ifs);
3458 3459                  }
3459 3460          }
3460 3461  
3461 3462          if (ifs->ifs_fr_state_doflush) {
3462 3463                  (void) fr_state_flush(FLUSH_TABLE_EXTRA, 0, ifs);
3463 3464                  ifs->ifs_fr_state_doflush = 0;
3464 3465          }
3465 3466          RWLOCK_EXIT(&ifs->ifs_ipf_state);
3466 3467          SPL_X(s);
3467 3468  }
3468 3469  
3469 3470  
3470 3471  /* ---------------------------------------------------------------------- */
3471 3472  /* Function:    fr_state_flush                                            */
3472 3473  /* Returns:     int - 0 == success, -1 == failure                         */
3473 3474  /* Parameters:  flush_option - how to flush the active State table        */
3474 3475  /*              proto    - IP version to flush (4, 6, or both)            */
3475 3476  /*              ifs      - ipf stack instance                             */
3476 3477  /* Write Locks: ipf_state                                                 */
3477 3478  /*                                                                        */
3478 3479  /* Flush state tables.  Three possible flush options currently defined:   */
3479 3480  /*                                                                        */
3480 3481  /* FLUSH_TABLE_ALL      : Flush all state table entries                   */
3481 3482  /*                                                                        */
3482 3483  /* FLUSH_TABLE_CLOSING  : Flush entries with TCP connections which        */
3483 3484  /*                        have started to close on both ends using        */
3484 3485  /*                        ipf_flushclosing().                             */
3485 3486  /*                                                                        */
3486 3487  /* FLUSH_TABLE_EXTRA    : First, flush entries which are "almost" closed. */
3487 3488  /*                        Then, if needed, flush entries with TCP         */
3488 3489  /*                        connections which have been idle for a long     */
3489 3490  /*                        time with ipf_extraflush().                     */
3490 3491  /* ---------------------------------------------------------------------- */
3491 3492  static int fr_state_flush(flush_option, proto, ifs)
3492 3493  int flush_option, proto;
3493 3494  ipf_stack_t *ifs;
3494 3495  {
3495 3496          ipstate_t *is, *isn;
3496 3497          int removed;
3497 3498          SPL_INT(s);
3498 3499  
3499 3500          removed = 0;
3500 3501  
3501 3502          SPL_NET(s);
3502 3503          switch (flush_option)
3503 3504          {
3504 3505          case FLUSH_TABLE_ALL:
3505 3506                  isn = ifs->ifs_ips_list;
3506 3507                  while ((is = isn) != NULL) {
3507 3508                          isn = is->is_next;
3508 3509                          if ((proto != 0) && (is->is_v != proto))
3509 3510                                  continue;
3510 3511                          if (fr_delstate(is, ISL_FLUSH, ifs) == 0)
3511 3512                                  removed++;
3512 3513                  }
3513 3514                  break;
3514 3515  
3515 3516          case FLUSH_TABLE_CLOSING:
3516 3517                  removed = ipf_flushclosing(STATE_FLUSH,
3517 3518                                             IPF_TCPS_CLOSE_WAIT,
3518 3519                                             ifs->ifs_ips_tqtqb,
3519 3520                                             ifs->ifs_ips_utqe,
3520 3521                                             ifs);
3521 3522                  break;
3522 3523  
3523 3524          case FLUSH_TABLE_EXTRA:
3524 3525                  removed = ipf_flushclosing(STATE_FLUSH,
3525 3526                                             IPF_TCPS_FIN_WAIT_2,
3526 3527                                             ifs->ifs_ips_tqtqb,
3527 3528                                             ifs->ifs_ips_utqe,
3528 3529                                             ifs);
3529 3530  
3530 3531                  /*
3531 3532                   * Be sure we haven't done this in the last 10 seconds.
3532 3533                   */
3533 3534                  if (ifs->ifs_fr_ticks - ifs->ifs_ips_last_force_flush <
3534 3535                      IPF_TTLVAL(10))
3535 3536                          break;
3536 3537                  ifs->ifs_ips_last_force_flush = ifs->ifs_fr_ticks;
3537 3538                  removed += ipf_extraflush(STATE_FLUSH,
3538 3539                                            &ifs->ifs_ips_tqtqb[IPF_TCPS_ESTABLISHED],
3539 3540                                            ifs->ifs_ips_utqe,
3540 3541                                            ifs);
3541 3542                  break;
3542 3543  
3543 3544          default: /* Flush Nothing */
3544 3545                  break;
3545 3546          }
3546 3547  
3547 3548          SPL_X(s);
3548 3549          return (removed);
3549 3550  }
3550 3551  
3551 3552  
3552 3553  /* ------------------------------------------------------------------------ */
3553 3554  /* Function:    fr_tcp_age                                                  */
3554 3555  /* Returns:     int - 1 == state transition made, 0 == no change (rejected) */
3555 3556  /* Parameters:  tqe(I)   - pointer to timeout queue information             */
3556 3557  /*              fin(I)   - pointer to packet information                    */
3557 3558  /*              tqtab(I) - TCP timeout queue table this is in               */
3558 3559  /*              flags(I) - flags from state/NAT entry                       */
3559 3560  /*                                                                          */
3560 3561  /* Rewritten by Arjan de Vet <Arjan.deVet@adv.iae.nl>, 2000-07-29:          */
3561 3562  /*                                                                          */
3562 3563  /* - (try to) base state transitions on real evidence only,                 */
3563 3564  /*   i.e. packets that are sent and have been received by ipfilter;         */
3564 3565  /*   diagram 18.12 of TCP/IP volume 1 by W. Richard Stevens was used.       */
3565 3566  /*                                                                          */
3566 3567  /* - deal with half-closed connections correctly;                           */
3567 3568  /*                                                                          */
3568 3569  /* - store the state of the source in state[0] such that ipfstat            */
3569 3570  /*   displays the state as source/dest instead of dest/source; the calls    */
3570 3571  /*   to fr_tcp_age have been changed accordingly.                           */
3571 3572  /*                                                                          */
3572 3573  /* Internal Parameters:                                                     */
3573 3574  /*                                                                          */
3574 3575  /*    state[0] = state of source (host that initiated connection)           */
3575 3576  /*    state[1] = state of dest   (host that accepted the connection)        */
3576 3577  /*                                                                          */
3577 3578  /*    dir == 0 : a packet from source to dest                               */
3578 3579  /*    dir == 1 : a packet from dest to source                               */
3579 3580  /*                                                                          */
3580 3581  /* Locking: it is assumed that the parent of the tqe structure is locked.   */
3581 3582  /* ------------------------------------------------------------------------ */
3582 3583  int fr_tcp_age(tqe, fin, tqtab, flags)
3583 3584  ipftqent_t *tqe;
3584 3585  fr_info_t *fin;
3585 3586  ipftq_t *tqtab;
3586 3587  int flags;
3587 3588  {
3588 3589          int dlen, ostate, nstate, rval, dir;
3589 3590          u_char tcpflags;
3590 3591          tcphdr_t *tcp;
3591 3592          ipf_stack_t *ifs = fin->fin_ifs;
3592 3593  
3593 3594          tcp = fin->fin_dp;
3594 3595  
3595 3596          rval = 0;
3596 3597          dir = fin->fin_rev;
3597 3598          tcpflags = tcp->th_flags;
3598 3599          dlen = fin->fin_dlen - (TCP_OFF(tcp) << 2);
3599 3600  
3600 3601          ostate = tqe->tqe_state[1 - dir];
3601 3602          nstate = tqe->tqe_state[dir];
3602 3603  
3603 3604          DTRACE_PROBE4(
3604 3605                  indata,
3605 3606                  fr_info_t *, fin,
3606 3607                  int, ostate,
3607 3608                  int, nstate,
3608 3609                  u_char, tcpflags
3609 3610          );
3610 3611  
3611 3612          if (tcpflags & TH_RST) {
3612 3613                  if (!(tcpflags & TH_PUSH) && !dlen)
3613 3614                          nstate = IPF_TCPS_CLOSED;
3614 3615                  else
3615 3616                          nstate = IPF_TCPS_CLOSE_WAIT;
3616 3617  
3617 3618                  /*
3618 3619                   * Once RST is received, we must advance peer's state to
3619 3620                   * CLOSE_WAIT.
3620 3621                   */
3621 3622                  if (ostate <= IPF_TCPS_ESTABLISHED) {
3622 3623                          tqe->tqe_state[1 - dir] = IPF_TCPS_CLOSE_WAIT;
3623 3624                  }
3624 3625                  rval = 1;
3625 3626          } else {
3626 3627  
3627 3628                  switch (nstate)
3628 3629                  {
3629 3630                  case IPF_TCPS_LISTEN: /* 0 */
3630 3631                          if ((tcpflags & TH_OPENING) == TH_OPENING) {
3631 3632                                  /*
3632 3633                                   * 'dir' received an S and sends SA in
3633 3634                                   * response, CLOSED -> SYN_RECEIVED
3634 3635                                   */
3635 3636                                  nstate = IPF_TCPS_SYN_RECEIVED;
3636 3637                                  rval = 1;
3637 3638                          } else if ((tcpflags & TH_OPENING) == TH_SYN) {
3638 3639                                  /* 'dir' sent S, CLOSED -> SYN_SENT */
3639 3640                                  nstate = IPF_TCPS_SYN_SENT;
3640 3641                                  rval = 1;
3641 3642                          }
3642 3643                          /*
3643 3644                           * the next piece of code makes it possible to get
3644 3645                           * already established connections into the state table
3645 3646                           * after a restart or reload of the filter rules; this
3646 3647                           * does not work when a strict 'flags S keep state' is
3647 3648                           * used for tcp connections of course
3648 3649                           */
3649 3650                          if (((flags & IS_TCPFSM) == 0) &&
3650 3651                              ((tcpflags & TH_ACKMASK) == TH_ACK)) {
3651 3652                                  /*
3652 3653                                   * we saw an A, guess 'dir' is in ESTABLISHED
3653 3654                                   * mode
3654 3655                                   */
3655 3656                                  switch (ostate)
3656 3657                                  {
3657 3658                                  case IPF_TCPS_LISTEN :
3658 3659                                  case IPF_TCPS_SYN_RECEIVED :
3659 3660                                          nstate = IPF_TCPS_HALF_ESTAB;
3660 3661                                          rval = 1;
3661 3662                                          break;
3662 3663                                  case IPF_TCPS_HALF_ESTAB :
3663 3664                                  case IPF_TCPS_ESTABLISHED :
3664 3665                                          nstate = IPF_TCPS_ESTABLISHED;
3665 3666                                          rval = 1;
3666 3667                                          break;
3667 3668                                  default :
3668 3669                                          break;
3669 3670                                  }
3670 3671                          }
3671 3672                          /*
3672 3673                           * TODO: besides regular ACK packets we can have other
3673 3674                           * packets as well; it is yet to be determined how we
3674 3675                           * should initialize the states in those cases
3675 3676                           */
3676 3677                          break;
3677 3678  
3678 3679                  case IPF_TCPS_SYN_SENT: /* 1 */
3679 3680                          if ((tcpflags & ~(TH_ECN|TH_CWR)) == TH_SYN) {
3680 3681                                  /*
3681 3682                                   * A retransmitted SYN packet.  We do not reset
3682 3683                                   * the timeout here to fr_tcptimeout because a
3683 3684                                   * connection connect timeout does not renew
3684 3685                                   * after every packet that is sent.  We need to
3685 3686                                   * set rval so as to indicate the packet has
3686 3687                                   * passed the check for its flags being valid
3687 3688                                   * in the TCP FSM.  Setting rval to 2 has the
3688 3689                                   * result of not resetting the timeout.
3689 3690                                   */
3690 3691                                  rval = 2;
3691 3692                          } else if ((tcpflags & (TH_SYN|TH_FIN|TH_ACK)) ==
3692 3693                                     TH_ACK) {
3693 3694                                  /*
3694 3695                                   * we see an A from 'dir' which is in SYN_SENT
3695 3696                                   * state: 'dir' sent an A in response to an SA
3696 3697                                   * which it received, SYN_SENT -> ESTABLISHED
3697 3698                                   */
3698 3699                                  nstate = IPF_TCPS_ESTABLISHED;
3699 3700                                  rval = 1;
3700 3701                          } else if (tcpflags & TH_FIN) {
3701 3702                                  /*
3702 3703                                   * we see an F from 'dir' which is in SYN_SENT
3703 3704                                   * state and wants to close its side of the
3704 3705                                   * connection; SYN_SENT -> FIN_WAIT_1
3705 3706                                   */
3706 3707                                  nstate = IPF_TCPS_FIN_WAIT_1;
3707 3708                                  rval = 1;
3708 3709                          } else if ((tcpflags & TH_OPENING) == TH_OPENING) {
3709 3710                                  /*
3710 3711                                   * we see an SA from 'dir' which is already in
3711 3712                                   * SYN_SENT state, this means we have a
3712 3713                                   * simultaneous open; SYN_SENT -> SYN_RECEIVED
3713 3714                                   */
3714 3715                                  nstate = IPF_TCPS_SYN_RECEIVED;
3715 3716                                  rval = 1;
3716 3717                          }
3717 3718                          break;
3718 3719  
3719 3720                  case IPF_TCPS_SYN_RECEIVED: /* 2 */
3720 3721                          if ((tcpflags & (TH_SYN|TH_FIN|TH_ACK)) == TH_ACK) {
3721 3722                                  /*
3722 3723                                   * we see an A from 'dir' which was in
3723 3724                                   * SYN_RECEIVED state so it must now be in
3724 3725                                   * established state, SYN_RECEIVED ->
3725 3726                                   * ESTABLISHED
3726 3727                                   */
3727 3728                                  nstate = IPF_TCPS_ESTABLISHED;
3728 3729                                  rval = 1;
3729 3730                          } else if ((tcpflags & ~(TH_ECN|TH_CWR)) ==
3730 3731                                     TH_OPENING) {
3731 3732                                  /*
3732 3733                                   * We see an SA from 'dir' which is already in
3733 3734                                   * SYN_RECEIVED state.
3734 3735                                   */
3735 3736                                  rval = 2;
3736 3737                          } else if (tcpflags & TH_FIN) {
3737 3738                                  /*
3738 3739                                   * we see an F from 'dir' which is in
3739 3740                                   * SYN_RECEIVED state and wants to close its
3740 3741                                   * side of the connection; SYN_RECEIVED ->
3741 3742                                   * FIN_WAIT_1
3742 3743                                   */
3743 3744                                  nstate = IPF_TCPS_FIN_WAIT_1;
3744 3745                                  rval = 1;
3745 3746                          }
3746 3747                          break;
3747 3748  
3748 3749                  case IPF_TCPS_HALF_ESTAB: /* 3 */
3749 3750                          if (tcpflags & TH_FIN) {
3750 3751                                  nstate = IPF_TCPS_FIN_WAIT_1;
3751 3752                                  rval = 1;
3752 3753                          } else if ((tcpflags & TH_ACKMASK) == TH_ACK) {
3753 3754                                  /*
3754 3755                                   * If we've picked up a connection in mid
3755 3756                                   * flight, we could be looking at a follow on
3756 3757                                   * packet from the same direction as the one
3757 3758                                   * that created this state.  Recognise it but
3758 3759                                   * do not advance the entire connection's
3759 3760                                   * state.
3760 3761                                   */
3761 3762                                  switch (ostate)
3762 3763                                  {
3763 3764                                  case IPF_TCPS_LISTEN :
3764 3765                                  case IPF_TCPS_SYN_SENT :
3765 3766                                  case IPF_TCPS_SYN_RECEIVED :
3766 3767                                          rval = 1;
3767 3768                                          break;
3768 3769                                  case IPF_TCPS_HALF_ESTAB :
3769 3770                                  case IPF_TCPS_ESTABLISHED :
3770 3771                                          nstate = IPF_TCPS_ESTABLISHED;
3771 3772                                          rval = 1;
3772 3773                                          break;
3773 3774                                  default :
3774 3775                                          break;
3775 3776                                  }
3776 3777                          }
3777 3778                          break;
3778 3779  
3779 3780                  case IPF_TCPS_ESTABLISHED: /* 4 */
3780 3781                          rval = 1;
3781 3782                          if (tcpflags & TH_FIN) {
3782 3783                                  /*
3783 3784                                   * 'dir' closed its side of the connection;
3784 3785                                   * this gives us a half-closed connection;
3785 3786                                   * ESTABLISHED -> FIN_WAIT_1
3786 3787                                   */
3787 3788                                  if (ostate == IPF_TCPS_FIN_WAIT_1) {
3788 3789                                          nstate = IPF_TCPS_CLOSING;
3789 3790                                  } else {
3790 3791                                          nstate = IPF_TCPS_FIN_WAIT_1;
3791 3792                                  }
3792 3793                          } else if (tcpflags & TH_ACK) {
3793 3794                                  /*
3794 3795                                   * an ACK, should we exclude other flags here?
3795 3796                                   */
3796 3797                                  if (ostate == IPF_TCPS_FIN_WAIT_1) {
3797 3798                                          /*
3798 3799                                           * We know the other side did an active
3799 3800                                           * close, so we are ACKing the recvd
3800 3801                                           * FIN packet (does the window matching
3801 3802                                           * code guarantee this?) and go into
3802 3803                                           * CLOSE_WAIT state; this gives us a
3803 3804                                           * half-closed connection
3804 3805                                           */
3805 3806                                          nstate = IPF_TCPS_CLOSE_WAIT;
3806 3807                                  } else if (ostate < IPF_TCPS_CLOSE_WAIT) {
3807 3808                                          /*
3808 3809                                           * still a fully established
3809 3810                                           * connection reset timeout
3810 3811                                           */
3811 3812                                          nstate = IPF_TCPS_ESTABLISHED;
3812 3813                                  }
3813 3814                          }
3814 3815                          break;
3815 3816  
3816 3817                  case IPF_TCPS_CLOSE_WAIT: /* 5 */
3817 3818                          rval = 1;
3818 3819                          if (tcpflags & TH_FIN) {
3819 3820                                  /*
3820 3821                                   * application closed and 'dir' sent a FIN,
3821 3822                                   * we're now going into LAST_ACK state
3822 3823                                   */
3823 3824                                  nstate = IPF_TCPS_LAST_ACK;
3824 3825                          } else {
3825 3826                                  /*
3826 3827                                   * we remain in CLOSE_WAIT because the other
3827 3828                                   * side has closed already and we did not
3828 3829                                   * close our side yet; reset timeout
3829 3830                                   */
3830 3831                                  nstate = IPF_TCPS_CLOSE_WAIT;
3831 3832                          }
3832 3833                          break;
3833 3834  
3834 3835                  case IPF_TCPS_FIN_WAIT_1: /* 6 */
3835 3836                          rval = 1;
3836 3837                          if ((tcpflags & TH_ACK) &&
3837 3838                              ostate > IPF_TCPS_CLOSE_WAIT) {
3838 3839                                  /*
3839 3840                                   * if the other side is not active anymore
3840 3841                                   * it has sent us a FIN packet that we are
3841 3842                                   * ack'ing now with an ACK; this means both
3842 3843                                   * sides have now closed the connection and
3843 3844                                   * we go into LAST_ACK
3844 3845                                   */
3845 3846                                  /*
3846 3847                                   * XXX: how do we know we really are ACKing
3847 3848                                   * the FIN packet here? does the window code
3848 3849                                   * guarantee that?
3849 3850                                   */
3850 3851                                  nstate = IPF_TCPS_LAST_ACK;
3851 3852                          } else {
3852 3853                                  /*
3853 3854                                   * we closed our side of the connection
3854 3855                                   * already but the other side is still active
3855 3856                                   * (ESTABLISHED/CLOSE_WAIT); continue with
3856 3857                                   * this half-closed connection
3857 3858                                   */
3858 3859                                  nstate = IPF_TCPS_FIN_WAIT_1;
3859 3860                          }
3860 3861                          break;
3861 3862  
3862 3863                  case IPF_TCPS_CLOSING: /* 7 */
3863 3864                          if ((tcpflags & (TH_FIN|TH_ACK)) == TH_ACK) {
3864 3865                                  nstate = IPF_TCPS_TIME_WAIT;
3865 3866                          }
3866 3867                          rval = 1;
3867 3868                          break;
3868 3869  
3869 3870                  case IPF_TCPS_LAST_ACK: /* 8 */
3870 3871                          /*
3871 3872                           * We want to reset timer here to keep state in table.
3872 3873                           * If we would allow the state to time out here, while
3873 3874                           * there would still be packets being retransmitted, we
3874 3875                           * would cut off line between the two peers preventing
3875 3876                           * them from closing the connection properly.
3876 3877                           */
3877 3878                          rval = 1;
3878 3879                          break;
3879 3880  
3880 3881                  case IPF_TCPS_FIN_WAIT_2: /* 9 */
3881 3882                          /* NOT USED */
3882 3883                          break;
3883 3884  
3884 3885                  case IPF_TCPS_TIME_WAIT: /* 10 */
3885 3886                          /* we're in 2MSL timeout now */
3886 3887                          if (ostate == IPF_TCPS_LAST_ACK) {
3887 3888                                  nstate = IPF_TCPS_CLOSED;
3888 3889                                  rval = 1;
3889 3890                          } else {
3890 3891                                  rval = 2;
3891 3892                          }
3892 3893                          break;
3893 3894  
3894 3895                  case IPF_TCPS_CLOSED: /* 11 */
3895 3896                          rval = 2;
3896 3897                          break;
3897 3898  
3898 3899                  default :
3899 3900  #if defined(_KERNEL)
3900 3901                          ASSERT(nstate >= IPF_TCPS_LISTEN &&
3901 3902                              nstate <= IPF_TCPS_CLOSED);
3902 3903  #else
3903 3904                          abort();
3904 3905  #endif
3905 3906                          break;
3906 3907                  }
3907 3908          }
3908 3909  
3909 3910          /*
3910 3911           * If rval == 2 then do not update the queue position, but treat the
3911 3912           * packet as being ok.
3912 3913           */
3913 3914          if (rval == 2) {
3914 3915                  DTRACE_PROBE1(state_keeping_timer, int, nstate);
3915 3916                  rval = 1;
3916 3917          }
3917 3918          else if (rval == 1) {
3918 3919                  tqe->tqe_state[dir] = nstate;
3919 3920                  /*
3920 3921                   * The nstate can either advance to a new state, or remain
3921 3922                   * unchanged, resetting the timer by moving to the bottom of
3922 3923                   * the queue.
3923 3924                   */
3924 3925                  DTRACE_PROBE1(state_done, int, nstate);
3925 3926  
3926 3927                  if ((tqe->tqe_flags & TQE_RULEBASED) == 0)
3927 3928                          fr_movequeue(tqe, tqe->tqe_ifq, tqtab + nstate, ifs);
3928 3929          }
3929 3930  
3930 3931          return rval;
3931 3932  }
3932 3933  
3933 3934  
3934 3935  /* ------------------------------------------------------------------------ */
3935 3936  /* Function:    ipstate_log                                                 */
3936 3937  /* Returns:     Nil                                                         */
3937 3938  /* Parameters:  is(I)   - pointer to state structure                        */
3938 3939  /*              type(I) - type of log entry to create                       */
           /*              ifs(I)  - ipf stack instance                                */
3939 3940  /*                                                                          */
3940 3941  /* Creates a state table log entry using the state structure and type info  */
3941 3942  /* passed in.  Log packet/byte counts, source/destination address and other */
3942 3943  /* protocol specific information.                                           */
           /* The body is compiled only when IPFILTER_LOG is defined; otherwise this   */
           /* function does nothing.                                                   */
3943 3944  /* ------------------------------------------------------------------------ */
3944 3945  void ipstate_log(is, type, ifs)
3945 3946  struct ipstate *is;
3946 3947  u_int type;
3947 3948  ipf_stack_t *ifs;
3948 3949  {
3949 3950  #ifdef  IPFILTER_LOG
           /*
            * NOTE(review): ipsl is a stack variable that is filled in field by
            * field and never zeroed as a whole; e.g. isl_state[] is only assigned
            * for TCP.  Confirm that no unassigned bytes matter to log readers.
            */
3950 3951          struct  ipslog  ipsl;
3951 3952          size_t sizes[1];
3952 3953          void *items[1];
3953 3954          int types[1];
3954 3955  
3955 3956          /*
3956 3957           * Copy information out of the ipstate_t structure and into the
3957 3958           * structure used for logging.
           *
           * Each of the four packet counters logged is the sum of the state's
           * own packet count and the ICMP packets counted against that state
           * (is_icmppkts[]); byte counts are copied unchanged.
3958 3959           */
3959 3960          ipsl.isl_type = type;
3960 3961          ipsl.isl_pkts[0] = is->is_pkts[0] + is->is_icmppkts[0];
3961 3962          ipsl.isl_bytes[0] = is->is_bytes[0];
3962 3963          ipsl.isl_pkts[1] = is->is_pkts[1] + is->is_icmppkts[1];
3963 3964          ipsl.isl_bytes[1] = is->is_bytes[1];
3964 3965          ipsl.isl_pkts[2] = is->is_pkts[2] + is->is_icmppkts[2];
3965 3966          ipsl.isl_bytes[2] = is->is_bytes[2];
3966 3967          ipsl.isl_pkts[3] = is->is_pkts[3] + is->is_icmppkts[3];
3967 3968          ipsl.isl_bytes[3] = is->is_bytes[3];
3968 3969          ipsl.isl_src = is->is_src;
3969 3970          ipsl.isl_dst = is->is_dst;
3970 3971          ipsl.isl_p = is->is_p;
3971 3972          ipsl.isl_v = is->is_v;
3972 3973          ipsl.isl_flags = is->is_flags;
3973 3974          ipsl.isl_tag = is->is_tag;
3974 3975          ipsl.isl_rulen = is->is_rulen;
                   /*
                    * NOTE(review): strncpy() leaves isl_group without a terminating
                    * NUL when is_group occupies all FR_GROUPLEN bytes; confirm that
                    * consumers of the log record treat it as a fixed-width field.
                    */
3975 3976          (void) strncpy(ipsl.isl_group, is->is_group, FR_GROUPLEN);
3976 3977  
                   /*
                    * Protocol specific part: ports (plus both TCP states for TCP),
                    * the ICMP/ICMPv6 type, or zeroed filler for anything else.
                    */
3977 3978          if (ipsl.isl_p == IPPROTO_TCP || ipsl.isl_p == IPPROTO_UDP) {
3978 3979                  ipsl.isl_sport = is->is_sport;
3979 3980                  ipsl.isl_dport = is->is_dport;
3980 3981                  if (ipsl.isl_p == IPPROTO_TCP) {
3981 3982                          ipsl.isl_state[0] = is->is_state[0];
3982 3983                          ipsl.isl_state[1] = is->is_state[1];
3983 3984                  }
3984 3985          } else if (ipsl.isl_p == IPPROTO_ICMP) {
3985 3986                  ipsl.isl_itype = is->is_icmp.ici_type;
3986 3987          } else if (ipsl.isl_p == IPPROTO_ICMPV6) {
3987 3988                  ipsl.isl_itype = is->is_icmp.ici_type;
3988 3989          } else {
3989 3990                  ipsl.isl_ps.isl_filler[0] = 0;
3990 3991                  ipsl.isl_ps.isl_filler[1] = 0;
3991 3992          }
3992 3993  
                   /* The record is handed to ipllog() as a single item. */
3993 3994          items[0] = &ipsl;
3994 3995          sizes[0] = sizeof(ipsl);
3995 3996          types[0] = 0;
3996 3997  
                   /*
                    * A non-zero return from ipllog() is counted in iss_logged, a
                    * zero return in iss_logfail.
                    */
3997 3998          if (ipllog(IPL_LOGSTATE, NULL, items, sizes, types, 1, ifs)) {
3998 3999                  ATOMIC_INCL(ifs->ifs_ips_stats.iss_logged);
3999 4000          } else {
4000 4001                  ATOMIC_INCL(ifs->ifs_ips_stats.iss_logfail);
4001 4002          }
4002 4003  #endif
4003 4004  }
4004 4005  
4005 4006  
4006 4007  #ifdef  USE_INET6
4007 4008  /* ------------------------------------------------------------------------ */
4008 4009  /* Function:    fr_checkicmp6matchingstate                                  */
4009 4010  /* Returns:     ipstate_t* - NULL == no match found,                        */
4010 4011  /*                           else  pointer to matching state entry          */
4011 4012  /* Parameters:  fin(I) - pointer to packet information                      */
4012 4013  /* Locks:       NULL == no locks, else Read Lock on ipf_state               */
4013 4014  /*                                                                          */
4014 4015  /* If we've got an ICMPv6 error message, using the information stored in    */
4015 4016  /* the ICMPv6 packet, look for a matching state table entry.                */
           /*                                                                          */
           /* When a match is returned, the read lock taken on ifs_ipf_state inside    */
           /* this function has NOT been released; only the no-match paths call        */
           /* RWLOCK_EXIT before returning NULL.  On a match fin_rev is set, the       */
           /* iss_hits statistic is bumped and the state's is_icmppkts[] counter for   */
           /* the computed direction is incremented.                                   */
4016 4017  /* ------------------------------------------------------------------------ */
4017 4018  static ipstate_t *fr_checkicmp6matchingstate(fin)
4018 4019  fr_info_t *fin;
4019 4020  {
4020 4021          struct icmp6_hdr *ic6, *oic;
4021 4022          int backward, i;
4022 4023          ipstate_t *is, **isp;
4023 4024          u_short sport, dport;
4024 4025          i6addr_t dst, src;
4025 4026          u_short savelen;
4026 4027          icmpinfo_t *ic;
4027 4028          fr_info_t ofin;
4028 4029          tcphdr_t *tcp;
4029 4030          ip6_t *oip6;
4030 4031          u_char  pr;
4031 4032          u_int hv;
4032 4033          ipf_stack_t *ifs = fin->fin_ifs;
4033 4034  
4034 4035          /*
4035 4036           * Does it at least have the return (basic) IP header ?
4036 4037           * Is it an actual recognised ICMP error type?
4037 4038           * Only a basic IP header (no options) should be with
4038 4039           * an ICMP error header.
4039 4040           */
4040 4041          if ((fin->fin_v != 6) || (fin->fin_plen < ICMP6ERR_MINPKTLEN) ||
4041 4042              !(fin->fin_flx & FI_ICMPERR))
4042 4043                  return NULL;
4043 4044  
4044 4045          ic6 = fin->fin_dp;
4045 4046  
                   /*
                    * oip6 is the IPv6 header of the offending packet, embedded
                    * ICMPERR_ICMPHLEN bytes past the start of the ICMPv6 header.
                    * NOTE(review): the length test below compares fin_plen with the
                    * size of the embedded header alone; presumably the earlier
                    * ICMP6ERR_MINPKTLEN test is what actually guarantees the
                    * embedded header is present - confirm.
                    */
4046 4047          oip6 = (ip6_t *)((char *)ic6 + ICMPERR_ICMPHLEN);
4047 4048          if (fin->fin_plen < sizeof(*oip6))
4048 4049                  return NULL;
4049 4050  
                   /*
                    * Build ofin, a packet-info structure describing the embedded
                    * packet: start from a copy of fin, flip the direction and clear
                    * the mbuf pointers.
                    */
4050 4051          bcopy((char *)fin, (char *)&ofin, sizeof(*fin));
4051 4052          ofin.fin_v = 6;
4052 4053          ofin.fin_ifp = fin->fin_ifp;
4053 4054          ofin.fin_out = !fin->fin_out;
4054 4055          ofin.fin_m = NULL;      /* if dereferenced, panic XXX */
4055 4056          ofin.fin_mp = NULL;     /* if dereferenced, panic XXX */
4056 4057  
4057 4058          /*
4058 4059           * We make a fin entry to be able to feed it to
4059 4060           * matchsrcdst. Note that not all fields are necessary
4060 4061           * but this is the cleanest way. Note further we fill
4061 4062           * in fin_mp such that if someone uses it we'll get
4062 4063           * a kernel panic. fr_matchsrcdst does not use this.
4063 4064           *
4064 4065           * watch out here, as ip is in host order and oip6 in network
4065 4066           * order. Any change we make must be undone afterwards.
                    *
                    * ip6_plen of the embedded header is temporarily overwritten
                    * while fr_makefrip() parses it, then restored from savelen.
4066 4067           */
4067 4068          savelen = oip6->ip6_plen;
4068 4069          oip6->ip6_plen = fin->fin_dlen - ICMPERR_ICMPHLEN;
4069 4070          ofin.fin_flx = FI_NOCKSUM;
4070 4071          ofin.fin_ip = (ip_t *)oip6;
4071 4072          ofin.fin_plen = oip6->ip6_plen;
4072 4073          (void) fr_makefrip(sizeof(*oip6), (ip_t *)oip6, &ofin);
4073 4074          ofin.fin_flx &= ~(FI_BAD|FI_SHORT);
4074 4075          oip6->ip6_plen = savelen;
4075 4076  
                   /*
                    * Case 1: the embedded packet is itself ICMPv6.  This branch
                    * always returns (match or NULL) and never falls through.
                    */
4076 4077          if (oip6->ip6_nxt == IPPROTO_ICMPV6) {
4077 4078                  oic = (struct icmp6_hdr *)(oip6 + 1);
4078 4079                  /*
4079 4080                   * an ICMP error can only be generated as a result of an
4080 4081                   * ICMP query, not as the response on an ICMP error
4081 4082                   *
4082 4083                   * XXX theoretically ICMP_ECHOREP and the other reply's are
4083 4084                   * ICMP query's as well, but adding them here seems strange XXX
                            *
                            * The test rejects embedded types that do not have
                            * ICMP6_INFOMSG_MASK set.
4084 4085                   */
4085 4086                   if (!(oic->icmp6_type & ICMP6_INFOMSG_MASK))
4086 4087                          return NULL;
4087 4088  
4088 4089                  /*
4089 4090                   * perform a lookup of the ICMP packet in the state table
                            *
                            * NOTE(review): only the in4.s_addr member of each address
                            * is folded into the hash here, whereas the non-ICMP path
                            * below sums all four i6[] words; confirm this matches the
                            * hash used when ICMPv6 state entries are inserted.
4090 4091                   */
4091 4092                  hv = (pr = oip6->ip6_nxt);
4092 4093                  src.in6 = oip6->ip6_src;
4093 4094                  hv += src.in4.s_addr;
4094 4095                  dst.in6 = oip6->ip6_dst;
4095 4096                  hv += dst.in4.s_addr;
4096 4097                  hv += oic->icmp6_id;
4097 4098                  hv += oic->icmp6_seq;
4098 4099                  hv = DOUBLE_HASH(hv, ifs);
4099 4100  
4100 4101                  READ_ENTER(&ifs->ifs_ipf_state);
4101 4102                  for (isp = &ifs->ifs_ips_table[hv]; ((is = *isp) != NULL); ) {
4102 4103                          ic = &is->is_icmp;
                                   /*
                                    * Advance isp now: the condition below overwrites
                                    * 'is' with the result of fr_matchsrcdst().
                                    */
4103 4104                          isp = &is->is_hnext;
4104 4105                          if ((is->is_p == pr) &&
4105 4106                              !(is->is_pass & FR_NOICMPERR) &&
4106 4107                              (oic->icmp6_id == ic->ici_id) &&
4107 4108                              (oic->icmp6_seq == ic->ici_seq) &&
4108 4109                              (is = fr_matchsrcdst(&ofin, is, &src,
4109 4110                                                   &dst, NULL, FI_ICMPCMP))) {
4110 4111                                  /*
4111 4112                                   * in the state table ICMP query's are stored
4112 4113                                   * with the type of the corresponding ICMP
4113 4114                                   * response. Correct here
                                            *
                                            * Accepted: stored ECHO_REPLY against an
                                            * embedded ECHO_REQUEST, or any stored type
                                            * that is one greater than the embedded
                                            * type.
4114 4115                                   */
4115 4116                                  if (((ic->ici_type == ICMP6_ECHO_REPLY) &&
4116 4117                                       (oic->icmp6_type == ICMP6_ECHO_REQUEST)) ||
4117 4118                                       (ic->ici_type - 1 == oic->icmp6_type )) {
4118 4119                                          ifs->ifs_ips_stats.iss_hits++;
4119 4120                                          backward = IP6_NEQ(&is->is_dst, &src);
4120 4121                                          fin->fin_rev = !backward;
4121 4122                                          i = (backward << 1) + fin->fin_out;
4122 4123                                          is->is_icmppkts[i]++;
                                                   /* returns with the read lock held */
4123 4124                                          return is;
4124 4125                                  }
4125 4126                          }
4126 4127                  }
4127 4128                  RWLOCK_EXIT(&ifs->ifs_ipf_state);
4128 4129                  return NULL;
4129 4130          }
4130 4131  
                   /*
                    * Case 2: any other embedded protocol.  The hash is built from
                    * the protocol number, all four 32-bit words of each address
                    * and, for TCP/UDP, both port numbers.
                    */
4131 4132          hv = (pr = oip6->ip6_nxt);
4132 4133          src.in6 = oip6->ip6_src;
4133 4134          hv += src.i6[0];
4134 4135          hv += src.i6[1];
4135 4136          hv += src.i6[2];
4136 4137          hv += src.i6[3];
4137 4138          dst.in6 = oip6->ip6_dst;
4138 4139          hv += dst.i6[0];
4139 4140          hv += dst.i6[1];
4140 4141          hv += dst.i6[2];
4141 4142          hv += dst.i6[3];
4142 4143  
                   /*
                    * The header following the embedded IPv6 header is viewed as a
                    * tcphdr_t for UDP as well as TCP; only the two port fields are
                    * read from it here.
                    */
4143 4144          if ((oip6->ip6_nxt == IPPROTO_TCP) || (oip6->ip6_nxt == IPPROTO_UDP)) {
4144 4145                  tcp = (tcphdr_t *)(oip6 + 1);
4145 4146                  dport = tcp->th_dport;
4146 4147                  sport = tcp->th_sport;
4147 4148                  hv += dport;
4148 4149                  hv += sport;
4149 4150          } else
4150 4151                  tcp = NULL;
4151 4152          hv = DOUBLE_HASH(hv, ifs);
4152 4153  
4153 4154          READ_ENTER(&ifs->ifs_ipf_state);
4154 4155          for (isp = &ifs->ifs_ips_table[hv]; ((is = *isp) != NULL); ) {
                           /* advance first: 'is' is overwritten by the match below */
4155 4156                  isp = &is->is_hnext;
4156 4157                  /*
4157 4158                   * Only allow this icmp though if the
4158 4159                   * encapsulated packet was allowed through the
4159 4160                   * other way around. Note that the minimal amount
4160 4161                   * of info present does not allow for checking against
4161 4162                   * tcp internals such as seq and ack numbers.
4162 4163                   */
4163 4164                  if ((is->is_p != pr) || (is->is_v != 6) ||
4164 4165                      (is->is_pass & FR_NOICMPERR))
4165 4166                          continue;
4166 4167                  is = fr_matchsrcdst(&ofin, is, &src, &dst, tcp, FI_ICMPCMP);
4167 4168                  if (is != NULL) {
4168 4169                          ifs->ifs_ips_stats.iss_hits++;
4169 4170                          backward = IP6_NEQ(&is->is_dst, &src);
4170 4171                          fin->fin_rev = !backward;
4171 4172                          i = (backward << 1) + fin->fin_out;
4172 4173                          is->is_icmppkts[i]++;
4173 4174                          /*
4174 4175                           * we deliberately do not touch the timeouts
4175 4176                           * for the accompanying state table entry.
4176 4177                           * It remains to be seen if that is correct. XXX
                                    *
                                    * Returns with the read lock still held.
4177 4178                           */
4178 4179                          return is;
4179 4180                  }
4180 4181          }
4181 4182          RWLOCK_EXIT(&ifs->ifs_ipf_state);
4182 4183          return NULL;
4183 4184  }
4184 4185  #endif
4185 4186  
4186 4187  
4187 4188  /* ------------------------------------------------------------------------ */
4188 4189  /* Function:    fr_sttab_init                                               */
4189 4190  /* Returns:     Nil                                                         */
4190 4191  /* Parameters:  tqp(I) - pointer to an array of timeout queues for TCP      */
           /*              ifs(I) - ipf stack instance supplying the timeout values    */
4191 4192  /*                                                                          */
4192 4193  /* Initialise the array of timeout queues for TCP.                          */
           /* The array is indexed by TCP state and must hold IPF_TCP_NSTATES entries. */
4193 4194  /* ------------------------------------------------------------------------ */
4194 4195  void fr_sttab_init(tqp, ifs)
4195 4196  ipftq_t *tqp;
4196 4197  ipf_stack_t *ifs;
4197 4198  {
4198 4199          int i;
4199 4200  
                   /*
                    * Give every queue an empty list, a reference count of 1, a
                    * zero ttl and its own mutex, and chain each entry to the one
                    * after it in the array.
                    */
4200 4201          for (i = IPF_TCP_NSTATES - 1; i >= 0; i--) {
4201 4202                  tqp[i].ifq_ttl = 0;
4202 4203                  tqp[i].ifq_ref = 1;
4203 4204                  tqp[i].ifq_head = NULL;
4204 4205                  tqp[i].ifq_tail = &tqp[i].ifq_head;
4205 4206                  tqp[i].ifq_next = tqp + i + 1;
4206 4207                  MUTEX_INIT(&tqp[i].ifq_lock, "ipftq tcp tab");
4207 4208          }
                   /*
                    * The loop pointed the last entry one past the end of the
                    * array; terminate the chain there instead.
                    */
4208 4209          tqp[IPF_TCP_NSTATES - 1].ifq_next = NULL;
                   /* Per-state timeouts, taken from the stack instance's settings. */
4209 4210          tqp[IPF_TCPS_CLOSED].ifq_ttl = ifs->ifs_fr_tcpclosed;
4210 4211          tqp[IPF_TCPS_LISTEN].ifq_ttl = ifs->ifs_fr_tcptimeout;
4211 4212          tqp[IPF_TCPS_SYN_SENT].ifq_ttl = ifs->ifs_fr_tcptimeout;
4212 4213          tqp[IPF_TCPS_SYN_RECEIVED].ifq_ttl = ifs->ifs_fr_tcptimeout;
4213 4214          tqp[IPF_TCPS_ESTABLISHED].ifq_ttl = ifs->ifs_fr_tcpidletimeout;
4214 4215          tqp[IPF_TCPS_CLOSE_WAIT].ifq_ttl = ifs->ifs_fr_tcphalfclosed;
4215 4216          tqp[IPF_TCPS_FIN_WAIT_1].ifq_ttl = ifs->ifs_fr_tcphalfclosed;
4216 4217          tqp[IPF_TCPS_CLOSING].ifq_ttl = ifs->ifs_fr_tcptimeout;
4217 4218          tqp[IPF_TCPS_LAST_ACK].ifq_ttl = ifs->ifs_fr_tcplastack;
4218 4219          tqp[IPF_TCPS_FIN_WAIT_2].ifq_ttl = ifs->ifs_fr_tcpclosewait;
4219 4220          tqp[IPF_TCPS_TIME_WAIT].ifq_ttl = ifs->ifs_fr_tcptimeout;
4220 4221          tqp[IPF_TCPS_HALF_ESTAB].ifq_ttl = ifs->ifs_fr_tcptimeout;
4221 4222  }
4222 4223  
4223 4224  
4224 4225  /* ------------------------------------------------------------------------ */
4225 4226  /* Function:    fr_sttab_destroy                                            */
4226 4227  /* Returns:     Nil                                                         */
4227 4228  /* Parameters:  tqp(I) - pointer to an array of timeout queues for TCP      */
4228 4229  /*                                                                          */
4229 4230  /* Do whatever is necessary to "destroy" each of the entries in the array   */
4230 4231  /* of timeout queues for TCP.                                               */
           /* At present that is only destroying each queue's mutex; the array itself  */
           /* and anything still linked on the queues are not touched here.            */
4231 4232  /* ------------------------------------------------------------------------ */
4232 4233  void fr_sttab_destroy(tqp)
4233 4234  ipftq_t *tqp;
4234 4235  {
4235 4236          int i;
4236 4237  
                   /* Same IPF_TCP_NSTATES entries, same order, as fr_sttab_init(). */
4237 4238          for (i = IPF_TCP_NSTATES - 1; i >= 0; i--)
4238 4239                  MUTEX_DESTROY(&tqp[i].ifq_lock);
4239 4240  }
4240 4241  
4241 4242  
4242 4243  /* ------------------------------------------------------------------------ */
4243 4244  /* Function:    fr_statederef                                               */
4244 4245  /* Returns:     Nil                                                         */
4245 4246  /* Parameters:  isp(I) - pointer to pointer to state table entry            */
4246 4247  /*              ifs - ipf stack instance                                    */
4247 4248  /*                                                                          */
4248 4249  /* Decrement the reference counter for this state table entry and free it   */
4249 4250  /* if there are no more things using it.                                    */
           /* The caller's pointer (*isp) is always set to NULL.                       */
4250 4251  /*                                                                          */
4251 4252  /* Internal parameters:                                                     */
4252 4253  /*    state[0] = state of source (host that initiated connection)           */
4253 4254  /*    state[1] = state of dest   (host that accepted the connection)        */
4254 4255  /* ------------------------------------------------------------------------ */
4255 4256  void fr_statederef(isp, ifs)
4256 4257  ipstate_t **isp;
4257 4258  ipf_stack_t *ifs;
4258 4259  {
4259 4260          ipstate_t *is;
4260 4261  
                   /* Take the entry and clear the caller's reference to it. */
4261 4262          is = *isp;
4262 4263          *isp = NULL;
4263 4264  
                   /*
                    * More than one reference: just drop ours under the entry's
                    * own lock and return.
                    */
4264 4265          MUTEX_ENTER(&is->is_lock);
4265 4266          if (is->is_ref > 1) {
4266 4267                  is->is_ref--;
4267 4268                  MUTEX_EXIT(&is->is_lock);
4268 4269  #ifndef _KERNEL
                           /*
                            * Non-kernel builds only: if either side of the
                            * connection has moved past ESTABLISHED, delete the
                            * entry now with reason ISL_ORPHAN.
                            */
4269 4270                  if ((is->is_sti.tqe_state[0] > IPF_TCPS_ESTABLISHED) ||
4270 4271                     (is->is_sti.tqe_state[1] > IPF_TCPS_ESTABLISHED)) {
4271 4272                          (void) fr_delstate(is, ISL_ORPHAN, ifs);
4272 4273                  }
4273 4274  #endif
4274 4275                  return;
4275 4276          }
4276 4277          MUTEX_EXIT(&is->is_lock);
4277 4278  
                   /*
                    * Ours was the only reference: delete the entry under the
                    * state table write lock.  is_ref is not decremented on this
                    * path; presumably fr_delstate() accounts for the final
                    * reference itself - confirm against fr_delstate().
                    * NOTE(review): is_lock is released before the write lock is
                    * taken, so the reference check and the delete are not atomic.
                    */
4278 4279          WRITE_ENTER(&ifs->ifs_ipf_state);
4279 4280          (void) fr_delstate(is, ISL_EXPIRE, ifs);
4280 4281          RWLOCK_EXIT(&ifs->ifs_ipf_state);
4281 4282  }
4282 4283  
4283 4284  
4284 4285  /* ------------------------------------------------------------------------ */
4285 4286  /* Function:    fr_setstatequeue                                            */
4286 4287  /* Returns:     Nil                                                         */
4287 4288  /* Parameters:  is(I) - pointer to state structure                          */
4288 4289  /*              rev(I) - forward(0) or reverse(1) direction                 */
4289 4290  /* Locks:       ipf_state (read or write)                                   */
4290 4291  /*                                                                          */
4291 4292  /* Put the state entry on its default queue entry, using rev as a helper in */
4292 4293  /* determining which queue it should be placed on.                          */
4293 4294  /* ------------------------------------------------------------------------ */
4294 4295  void fr_setstatequeue(is, rev, ifs)
4295 4296  ipstate_t *is;
4296 4297  int rev;
4297 4298  ipf_stack_t *ifs;
4298 4299  {
4299 4300          ipftq_t *oifq, *nifq;
4300 4301  
4301 4302  
4302 4303          if ((is->is_sti.tqe_flags & TQE_RULEBASED) != 0)
4303 4304                  nifq = is->is_tqehead[rev];
4304 4305          else
4305 4306                  nifq = NULL;
4306 4307  
4307 4308          if (nifq == NULL) {
4308 4309                  switch (is->is_p)
4309 4310                  {
4310 4311  #ifdef USE_INET6
4311 4312                  case IPPROTO_ICMPV6 :
4312 4313                          if (rev == 1)
4313 4314                                  nifq = &ifs->ifs_ips_icmpacktq;
4314 4315                          else
4315 4316                                  nifq = &ifs->ifs_ips_icmptq;
4316 4317                          break;
4317 4318  #endif
4318 4319                  case IPPROTO_ICMP :
4319 4320                          if (rev == 1)
4320 4321                                  nifq = &ifs->ifs_ips_icmpacktq;
4321 4322                          else
4322 4323                                  nifq = &ifs->ifs_ips_icmptq;
4323 4324                          break;
4324 4325                  case IPPROTO_TCP :
4325 4326                          nifq = ifs->ifs_ips_tqtqb + is->is_state[rev];
4326 4327                          break;
4327 4328  
4328 4329                  case IPPROTO_UDP :
4329 4330                          if (rev == 1)
4330 4331                                  nifq = &ifs->ifs_ips_udpacktq;
4331 4332                          else
4332 4333                                  nifq = &ifs->ifs_ips_udptq;
4333 4334                          break;
4334 4335  
4335 4336                  default :
4336 4337                          nifq = &ifs->ifs_ips_iptq;
4337 4338                          break;
4338 4339                  }
4339 4340          }
4340 4341  
4341 4342          oifq = is->is_sti.tqe_ifq;
4342 4343          /*
4343 4344           * If it's currently on a timeout queue, move it from one queue to
4344 4345           * another, else put it on the end of the newly determined queue.
4345 4346           */
4346 4347          if (oifq != NULL)
4347 4348                  fr_movequeue(&is->is_sti, oifq, nifq, ifs);
4348 4349          else
4349 4350                  fr_queueappend(&is->is_sti, nifq, is, ifs);
4350 4351          return;
4351 4352  }
4352 4353  
4353 4354  
4354 4355  /* ------------------------------------------------------------------------ */
4355 4356  /* Function:    fr_stateiter                                                */
4356 4357  /* Returns:     int - 0 == success, else error                              */
4357 4358  /* Parameters:  token(I) - pointer to ipftoken structure                    */
4358 4359  /*              itp(I)   - pointer to ipfgeniter structure                  */
4359 4360  /*                                                                          */
4360 4361  /* This function handles the SIOCGENITER ioctl for the state tables and     */
4361 4362  /* walks through the list of entries in the state table list (ips_list.)    */
4362 4363  /* ------------------------------------------------------------------------ */
4363 4364  static int fr_stateiter(token, itp, ifs)
4364 4365  ipftoken_t *token;
4365 4366  ipfgeniter_t *itp;
4366 4367  ipf_stack_t *ifs;
4367 4368  {
4368 4369          ipstate_t *is, *next, zero;
4369 4370          int error, count;
4370 4371          char *dst;
4371 4372  
4372 4373          if (itp->igi_data == NULL)
4373 4374                  return EFAULT;
4374 4375  
4375 4376          if (itp->igi_nitems == 0)
4376 4377                  return EINVAL;
4377 4378  
4378 4379          if (itp->igi_type != IPFGENITER_STATE)
4379 4380                  return EINVAL;
4380 4381  
4381 4382          error = 0;
4382 4383  
4383 4384          READ_ENTER(&ifs->ifs_ipf_state);
4384 4385  
4385 4386          /*
4386 4387           * Get "previous" entry from the token and find the next entry.
4387 4388           */
4388 4389          is = token->ipt_data;
4389 4390          if (is == NULL) {
4390 4391                  next = ifs->ifs_ips_list;
4391 4392          } else {
4392 4393                  next = is->is_next;
4393 4394          }
4394 4395  
4395 4396          dst = itp->igi_data;
4396 4397          for (count = itp->igi_nitems; count > 0; count--) {
4397 4398                  /*
4398 4399                   * If we found an entry, add a reference to it and update the token.
4399 4400                   * Otherwise, zero out data to be returned and NULL out token.
4400 4401                   */
4401 4402                  if (next != NULL) {
4402 4403                          MUTEX_ENTER(&next->is_lock);
4403 4404                          next->is_ref++;
4404 4405                          MUTEX_EXIT(&next->is_lock);
4405 4406                          token->ipt_data = next;
4406 4407                  } else {
4407 4408                          bzero(&zero, sizeof(zero));
4408 4409                          next = &zero;
4409 4410                          token->ipt_data = NULL;
4410 4411                  }
4411 4412  
4412 4413                  /*
4413 4414                   * Safe to release lock now the we have a reference.
4414 4415                   */
4415 4416                  RWLOCK_EXIT(&ifs->ifs_ipf_state);
4416 4417  
4417 4418                  /*
4418 4419                   * Copy out data and clean up references and tokens.
4419 4420                   */
4420 4421                  error = COPYOUT(next, dst, sizeof(*next));
4421 4422                  if (error != 0)
4422 4423                          error = EFAULT;
4423 4424                  if (token->ipt_data == NULL) {
4424 4425                          ipf_freetoken(token, ifs);
4425 4426                          break;
4426 4427                  } else {
4427 4428                          if (is != NULL)
4428 4429                                  fr_statederef(&is, ifs);
4429 4430                          if (next->is_next == NULL) {
4430 4431                                  ipf_freetoken(token, ifs);
4431 4432                                  break;
4432 4433                          }
4433 4434                  }
4434 4435  
4435 4436                  if ((count == 1) || (error != 0))
4436 4437                          break;
4437 4438  
4438 4439                  READ_ENTER(&ifs->ifs_ipf_state);
4439 4440                  dst += sizeof(*next);
4440 4441                  is = next;
4441 4442                  next = is->is_next;
4442 4443          }
4443 4444  
4444 4445          return error;
4445 4446  }
  
    | 
      ↓ open down ↓ | 
    2530 lines elided | 
    
      ↑ open up ↑ | 
  
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX