/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright 2018 Nexenta Systems, Inc.
 */

#include <sys/systm.h>
#include <sys/sdt.h>
#include <rpc/types.h>
#include <rpc/auth.h>
#include <rpc/auth_unix.h>
#include <rpc/auth_des.h>
#include <rpc/svc.h>
#include <rpc/xdr.h>
#include <nfs/nfs4.h>
#include <nfs/nfs_dispatch.h>
#include <nfs/nfs4_drc.h>

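/*
 * Highest NFSv4 minor version this server implements.  COMPOUNDs
 * that request a larger minorversion are rejected with
 * NFS4ERR_MINOR_VERS_MISMATCH (see rfs4_minorvers_mismatch() below).
 */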
#define	NFS4_MAX_MINOR_VERSION	0

/*
 * The default size of the duplicate request cache
 */
uint32_t nfs4_drc_max = 8 * 1024;

/*
 * The number of buckets we'd like to hash the
 * replies into; do not change this on the fly.
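 *
 * (541 is prime, presumably chosen so that the xid % dr_hash
 * computation in rfs4_find_dr() spreads entries evenly across
 * the buckets.)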
 */
uint32_t nfs4_drc_hash = 541;

static void rfs4_resource_err(struct svc_req *req, COMPOUND4args *argsp);

extern zone_key_t rfs4_zone_key;

/*
 * Initialize a duplicate request cache.
 */
rfs4_drc_t *
rfs4_init_drc(uint32_t drc_size, uint32_t drc_hash_size)
{
	rfs4_drc_t *drc;
	uint32_t bki;

	ASSERT(drc_size);
	ASSERT(drc_hash_size);

	drc = kmem_alloc(sizeof (rfs4_drc_t), KM_SLEEP);

	drc->max_size = drc_size;
	drc->in_use = 0;

	mutex_init(&drc->lock, NULL, MUTEX_DEFAULT, NULL);

	drc->dr_hash = drc_hash_size;

	drc->dr_buckets = kmem_alloc(sizeof (list_t)*drc_hash_size, KM_SLEEP);

	for (bki = 0; bki < drc_hash_size; bki++) {
		list_create(&drc->dr_buckets[bki], sizeof (rfs4_dupreq_t),
		    offsetof(rfs4_dupreq_t, dr_bkt_next));
	}

	list_create(&(drc->dr_cache), sizeof (rfs4_dupreq_t),
	    offsetof(rfs4_dupreq_t, dr_next));

	return (drc);
}
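
/*
 * Usage sketch: the per-zone server init path (outside this file) is
 * expected to create the zone's DRC from the tunables above, along
 * the lines of:
 *
 *	nfs4_srv_t *nsrv4 = zone_getspecific(rfs4_zone_key, curzone);
 *	nsrv4->nfs4_drc = rfs4_init_drc(nfs4_drc_max, nfs4_drc_hash);
 *
 * and to tear it down via rfs4_fini_drc() at zone shutdown.  The
 * nsrv4->nfs4_drc field is what rfs4_fini_drc() and rfs4_dispatch()
 * below consult.
 */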

/*
 * Destroy a duplicate request cache.
 */
void
rfs4_fini_drc(void)
{
	nfs4_srv_t *nsrv4 = zone_getspecific(rfs4_zone_key, curzone);
	rfs4_drc_t *drc = nsrv4->nfs4_drc;
	rfs4_dupreq_t *drp, *drp_next;

	/* iterate over the dr_cache and free the entries */
	for (drp = list_head(&(drc->dr_cache)); drp != NULL; drp = drp_next) {

		if (drp->dr_state == NFS4_DUP_REPLAY)
			rfs4_compound_free(&(drp->dr_res));

		if (drp->dr_addr.buf != NULL)
			kmem_free(drp->dr_addr.buf, drp->dr_addr.maxlen);

		drp_next = list_next(&(drc->dr_cache), drp);

		kmem_free(drp, sizeof (rfs4_dupreq_t));
	}

	mutex_destroy(&drc->lock);
	kmem_free(drc->dr_buckets,
	    sizeof (list_t)*drc->dr_hash);
	kmem_free(drc, sizeof (rfs4_drc_t));
}
/*
 * rfs4_dr_chstate:
 *
 * Change the state of a rfs4_dupreq.  If the new state is not
 * NFS4_DUP_FREE, simply record it and return.  If we are moving to
 * the FREE state, remove the entry from the bucket and dr_cache
 * lists and free the compound results; the caller is then expected
 * to re-insert the entry (typically at the tail of dr_cache).
 */
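/*
 * The drc lock must be held across this call.  A minimal sketch of
 * the free-and-retire sequence, as used in rfs4_dispatch() below:
 *
 *	mutex_enter(&drp->drc->lock);
 *	rfs4_dr_chstate(drp, NFS4_DUP_FREE);
 *	list_insert_tail(&drp->drc->dr_cache, drp);
 *	mutex_exit(&drp->drc->lock);
 */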
void
rfs4_dr_chstate(rfs4_dupreq_t *drp, int new_state)
{
	rfs4_drc_t *drc;

	ASSERT(drp);
	ASSERT(drp->drc);
	ASSERT(drp->dr_bkt);
	ASSERT(MUTEX_HELD(&drp->drc->lock));

	drp->dr_state = new_state;

	if (new_state != NFS4_DUP_FREE)
		return;

	drc = drp->drc;

	/*
	 * Remove entry from the bucket and
	 * dr_cache list, free compound results.
	 */
	list_remove(drp->dr_bkt, drp);
	list_remove(&(drc->dr_cache), drp);
	rfs4_compound_free(&(drp->dr_res));
}

/*
 * rfs4_alloc_dr:
 *
 * Allocate a new entry if we have not reached the maximum cache
 * size; otherwise walk the list backwards from the tail and reuse
 * the first entry found in either the NFS4_DUP_FREE or the
 * NFS4_DUP_REPLAY state.
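 *
 * (The dr_cache list is kept in rough LRU order: rfs4_find_dr()
 * inserts new entries at the head and freed entries are re-inserted
 * at the tail, so older replayable entries drift towards the tail
 * and the backwards walk below finds a reusable one quickly.)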
 */
rfs4_dupreq_t *
rfs4_alloc_dr(rfs4_drc_t *drc)
{
	rfs4_dupreq_t *drp_tail, *drp = NULL;

	ASSERT(drc);
	ASSERT(MUTEX_HELD(&drc->lock));

	/*
	 * Have we hit the cache limit yet?
	 */
	if (drc->in_use < drc->max_size) {
		/*
		 * Nope, so let's malloc a new one
		 */
		drp = kmem_zalloc(sizeof (rfs4_dupreq_t), KM_SLEEP);
		drp->drc = drc;
		drc->in_use++;
		DTRACE_PROBE1(nfss__i__drc_new, rfs4_dupreq_t *, drp);
		return (drp);
	}

	/*
	 * The cache is fully allocated, so traverse the list
	 * backwards to find one we can reuse.
	 */
	for (drp_tail = list_tail(&drc->dr_cache); drp_tail != NULL;
	    drp_tail = list_prev(&drc->dr_cache, drp_tail)) {

		switch (drp_tail->dr_state) {

		case NFS4_DUP_FREE:
			list_remove(&(drc->dr_cache), drp_tail);
			DTRACE_PROBE1(nfss__i__drc_freeclaim,
			    rfs4_dupreq_t *, drp_tail);
			return (drp_tail);
			/* NOTREACHED */

		case NFS4_DUP_REPLAY:
			/* grab it. */
			rfs4_dr_chstate(drp_tail, NFS4_DUP_FREE);
			DTRACE_PROBE1(nfss__i__drc_replayclaim,
			    rfs4_dupreq_t *, drp_tail);
			return (drp_tail);
			/* NOTREACHED */
		}
	}
	DTRACE_PROBE1(nfss__i__drc_full, rfs4_drc_t *, drc);
	return (NULL);
}

/*
 * rfs4_find_dr:
 *
 * Search for an entry in the duplicate request cache by
 * calculating the hash index based on the XID, and examining
 * the entries in the hash bucket.  If we find a match, return.
 * Once we have searched the bucket we call rfs4_alloc_dr() to
 * allocate a new entry, or reuse one that is available.
 */
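/*
 * Return values, and how rfs4_dispatch() reacts to each:
 *
 *	NFS4_DUP_NEW	 a fresh entry was inserted; the caller runs
 *			 the compound and caches the reply in
 *			 (*dup)->dr_res
 *	NFS4_DUP_REPLAY	 *dup holds a cached reply ready to be resent
 *	NFS4_DUP_PENDING the original request is still being
 *			 processed; the caller drops this retransmission
 *	NFS4_DUP_ERROR	 the cache is exhausted; the caller sends a
 *			 resource error via rfs4_resource_err()
 */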
int
rfs4_find_dr(struct svc_req *req, rfs4_drc_t *drc, rfs4_dupreq_t **dup)
{

	uint32_t the_xid;
	list_t *dr_bkt;
	rfs4_dupreq_t *drp;
	int bktdex;

	/*
	 * Get the XID, calculate the bucket and search to
	 * see if we need to replay from the cache.
	 */
	the_xid = req->rq_xprt->xp_xid;
	bktdex = the_xid % drc->dr_hash;

	dr_bkt = &drc->dr_buckets[bktdex];

	DTRACE_PROBE3(nfss__i__drc_bktdex,
	    int, bktdex,
	    uint32_t, the_xid,
	    list_t *, dr_bkt);

	*dup = NULL;

	mutex_enter(&drc->lock);
	/*
	 * Search the bucket for a matching xid and address.
	 */
	for (drp = list_head(dr_bkt); drp != NULL;
	    drp = list_next(dr_bkt, drp)) {

		if (drp->dr_xid == the_xid &&
		    drp->dr_addr.len == req->rq_xprt->xp_rtaddr.len &&
		    bcmp((caddr_t)drp->dr_addr.buf,
		    (caddr_t)req->rq_xprt->xp_rtaddr.buf,
		    drp->dr_addr.len) == 0) {

			/*
			 * Found a match so REPLAY the Reply
			 */
			if (drp->dr_state == NFS4_DUP_REPLAY) {
				rfs4_dr_chstate(drp, NFS4_DUP_INUSE);
				mutex_exit(&drc->lock);
				*dup = drp;
				DTRACE_PROBE1(nfss__i__drc_replay,
				    rfs4_dupreq_t *, drp);
				return (NFS4_DUP_REPLAY);
			}

			/*
			 * This entry must be in transition, so return
			 * the 'pending' status.
			 */
			mutex_exit(&drc->lock);
			return (NFS4_DUP_PENDING);
		}
	}

	drp = rfs4_alloc_dr(drc);
	mutex_exit(&drc->lock);

	/*
	 * The DRC is full and all entries are in use.  The caller
	 * should error out this request and force the client to
	 * retransmit -- effectively this is a resource issue: NFSD
	 * threads may be tied up in the underlying file system, or
	 * the cache size is too small for the server load.
	 */
	if (drp == NULL)
		return (NFS4_DUP_ERROR);

	/*
	 * Init the state to NEW.
	 */
	drp->dr_state = NFS4_DUP_NEW;

	/*
	 * If needed, resize the address buffer
	 */
	if (drp->dr_addr.maxlen < req->rq_xprt->xp_rtaddr.len) {
		if (drp->dr_addr.buf != NULL)
			kmem_free(drp->dr_addr.buf, drp->dr_addr.maxlen);
		drp->dr_addr.maxlen = req->rq_xprt->xp_rtaddr.len;
		drp->dr_addr.buf = kmem_alloc(drp->dr_addr.maxlen, KM_NOSLEEP);
		if (drp->dr_addr.buf == NULL) {
			/*
			 * If the malloc fails, mark the entry
			 * as free and put it on the tail.
			 */
			drp->dr_addr.maxlen = 0;
			drp->dr_state = NFS4_DUP_FREE;
			mutex_enter(&drc->lock);
			list_insert_tail(&(drc->dr_cache), drp);
			mutex_exit(&drc->lock);
			return (NFS4_DUP_ERROR);
		}
	}

	/*
	 * Copy the address.
	 */
	drp->dr_addr.len = req->rq_xprt->xp_rtaddr.len;

	bcopy((caddr_t)req->rq_xprt->xp_rtaddr.buf,
	    (caddr_t)drp->dr_addr.buf,
	    drp->dr_addr.len);

	drp->dr_xid = the_xid;
	drp->dr_bkt = dr_bkt;

	/*
	 * Insert at the head of the bucket and
	 * the dr_cache lists.
	 */
	mutex_enter(&drc->lock);
	list_insert_head(&drc->dr_cache, drp);
	list_insert_head(dr_bkt, drp);
	mutex_exit(&drc->lock);

	*dup = drp;

	return (NFS4_DUP_NEW);
}

/*
 * This function handles the duplicate request cache,
 * NULL_PROC and COMPOUND procedure calls for NFSv4.
 *
 * Passed into this function are:
 *
 *	disp	A pointer to our dispatch table entry
 *	req	The request to process
 *	xprt	The server transport handle
 *	ap	A pointer to the arguments
 *	rlen	A pointer to the reply length (output)
 *
 * When appropriate this function is responsible for inserting
 * the reply into the duplicate cache or replaying an existing
 * cached reply.
 *
 * dr_stat reflects the state of the duplicate request that
 * has been inserted into or retrieved from the cache.
 *
 * drp is the duplicate request entry.
 */
int
rfs4_dispatch(struct rpcdisp *disp, struct svc_req *req,
    SVCXPRT *xprt, char *ap, size_t *rlen)
{

	COMPOUND4res res_buf;
	COMPOUND4res *rbp;
	COMPOUND4args *cap;
	cred_t *cr = NULL;
	int error = 0;
	int dis_flags = 0;
	int dr_stat = NFS4_NOT_DUP;
	rfs4_dupreq_t *drp = NULL;
	int rv;
	nfs4_srv_t *nsrv4 = zone_getspecific(rfs4_zone_key, curzone);
	rfs4_drc_t *nfs4_drc = nsrv4->nfs4_drc;

	ASSERT(disp);

	/*
	 * Short circuit the RPC_NULL proc.
	 */
	if (disp->dis_proc == rpc_null) {
		DTRACE_NFSV4_1(null__start, struct svc_req *, req);
		if (!svc_sendreply(xprt, xdr_void, NULL)) {
			DTRACE_NFSV4_1(null__done, struct svc_req *, req);
			svcerr_systemerr(xprt);
			return (1);
		}
		DTRACE_NFSV4_1(null__done, struct svc_req *, req);
		*rlen = xdr_sizeof(xdr_void, NULL);
		return (0);
	}

	/* Only NFSv4 Compounds from this point onward */

	rbp = &res_buf;
	cap = (COMPOUND4args *)ap;

	/*
	 * Update kstats
	 */
	rfs4_compound_kstat_args(cap);

	/*
	 * Figure out the disposition of the whole COMPOUND
	 * and record its IDEMPOTENCY.
	 */
	rfs4_compound_flagproc(cap, &dis_flags);

	/*
	 * If NON-IDEMPOTENT then we need to figure out if this
	 * request can be replied from the duplicate cache.
	 *
	 * If this is a new request then we need to insert the
	 * reply into the duplicate cache.
	 */
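	/*
	 * A note on T_DONTPEND/T_WOULDBLOCK below (our reading of
	 * these flags): T_DONTPEND asks lower layers (e.g. a file
	 * system being locked/frozen) not to block this thread; a
	 * layer that would have blocked sets T_WOULDBLOCK instead.
	 * When we see T_WOULDBLOCK we drop the reply without sending
	 * it, so the client retransmits once the resource is
	 * available again.
	 */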
	if (!(dis_flags & RPC_IDEMPOTENT)) {
		/* look for a replay from the cache or allocate */
		dr_stat = rfs4_find_dr(req, nfs4_drc, &drp);

		switch (dr_stat) {

		case NFS4_DUP_ERROR:
			rfs4_resource_err(req, cap);
			return (1);
			/* NOTREACHED */

		case NFS4_DUP_PENDING:
			/*
			 * reply has previously been inserted into the
			 * duplicate cache, however the reply has
			 * not yet been sent via svc_sendreply()
			 */
			return (1);
			/* NOTREACHED */

		case NFS4_DUP_NEW:
			curthread->t_flag |= T_DONTPEND;
			/* NON-IDEMPOTENT proc call */
			rfs4_compound(cap, rbp, NULL, req, cr, &rv);
			curthread->t_flag &= ~T_DONTPEND;

			if (rv)	/* short ckt sendreply on error */
				return (rv);

			/*
			 * dr_res must be initialized before calling
			 * rfs4_dr_chstate (it frees the reply).
			 */
			drp->dr_res = res_buf;
			if (curthread->t_flag & T_WOULDBLOCK) {
				curthread->t_flag &= ~T_WOULDBLOCK;
				/*
				 * mark this entry as FREE and plop it
				 * on the end of the cache list
				 */
				mutex_enter(&drp->drc->lock);
				rfs4_dr_chstate(drp, NFS4_DUP_FREE);
				list_insert_tail(&(drp->drc->dr_cache), drp);
				mutex_exit(&drp->drc->lock);
				return (1);
			}
			break;

		case NFS4_DUP_REPLAY:
			/* replay from the cache */
			rbp = &(drp->dr_res);
			break;
		}
	} else {
		curthread->t_flag |= T_DONTPEND;
		/* IDEMPOTENT proc call */
		rfs4_compound(cap, rbp, NULL, req, cr, &rv);
		curthread->t_flag &= ~T_DONTPEND;

		if (rv)	/* short ckt sendreply on error */
			return (rv);

		if (curthread->t_flag & T_WOULDBLOCK) {
			curthread->t_flag &= ~T_WOULDBLOCK;
			return (1);
		}
	}

	/*
	 * Send out the replayed reply or the 'real' one.
	 */
	if (!svc_sendreply(xprt, xdr_COMPOUND4res_srv, (char *)rbp)) {
		DTRACE_PROBE2(nfss__e__dispatch_sendfail,
		    SVCXPRT *, xprt,
		    char *, rbp);
		svcerr_systemerr(xprt);
		error++;
	} else {
		/*
		 * Update kstats
		 */
		rfs4_compound_kstat_res(rbp);
		*rlen = xdr_sizeof(xdr_COMPOUND4res_srv, rbp);
	}

	/*
	 * If this reply was just inserted into the duplicate cache,
	 * or it was replayed from the dup cache, (re)mark it as
	 * available for replay.
	 *
	 * At first glance, this 'if' statement seems a little strange;
	 * testing for NFS4_DUP_REPLAY, and then calling...
	 *
	 *	rfs4_dr_chstate(NFS4_DUP_REPLAY)
	 *
	 * ... but notice that we are checking dr_stat, and not the
	 * state of the entry itself; the entry will be NFS4_DUP_INUSE.
	 * We do that so that we know not to prematurely reap it whilst
	 * we resend it to the client.
	 */
	if (dr_stat == NFS4_DUP_NEW || dr_stat == NFS4_DUP_REPLAY) {
		mutex_enter(&drp->drc->lock);
		rfs4_dr_chstate(drp, NFS4_DUP_REPLAY);
		mutex_exit(&drp->drc->lock);
	} else if (dr_stat == NFS4_NOT_DUP) {
		rfs4_compound_free(rbp);
	}

	return (error);
}

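/*
 * rfs4_minorvers_mismatch:
 *
 * Check whether an NFSv4 request asks for a minor version beyond
 * NFS4_MAX_MINOR_VERSION.  If so, reply with
 * NFS4ERR_MINOR_VERS_MISMATCH -- echoing the request tag and, as the
 * NFSv4 protocol requires for this error, processing no operations
 * (empty result array) -- and return TRUE.  Otherwise return FALSE
 * so that normal dispatch proceeds.
 */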
bool_t
rfs4_minorvers_mismatch(struct svc_req *req, SVCXPRT *xprt, void *args)
{
	COMPOUND4args *argsp;
	COMPOUND4res res_buf, *resp;

	if (req->rq_vers != 4)
		return (FALSE);

	argsp = (COMPOUND4args *)args;

	if (argsp->minorversion <= NFS4_MAX_MINOR_VERSION)
		return (FALSE);

	resp = &res_buf;

	/*
	 * Form a reply tag by copying over the request tag.
	 */
	resp->tag.utf8string_val =
	    kmem_alloc(argsp->tag.utf8string_len, KM_SLEEP);
	resp->tag.utf8string_len = argsp->tag.utf8string_len;
	bcopy(argsp->tag.utf8string_val, resp->tag.utf8string_val,
	    resp->tag.utf8string_len);
	resp->array_len = 0;
	resp->array = NULL;
	resp->status = NFS4ERR_MINOR_VERS_MISMATCH;
	if (!svc_sendreply(xprt, xdr_COMPOUND4res_srv, (char *)resp)) {
		DTRACE_PROBE2(nfss__e__minorvers_mismatch,
		    SVCXPRT *, xprt, char *, resp);
		svcerr_systemerr(xprt);
	}
	rfs4_compound_free(resp);
	return (TRUE);
}

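/*
 * rfs4_resource_err:
 *
 * Build and send a minimal COMPOUND reply indicating a resource
 * shortage; used by rfs4_dispatch() when the duplicate request cache
 * is exhausted (NFS4_DUP_ERROR).  The reply echoes the request tag
 * and carries a single result for the first operation, with status
 * NFS4ERR_RESOURCE (or NFS4ERR_OP_ILLEGAL if that op is OP_ILLEGAL,
 * for which NFS4ERR_RESOURCE is not allowed).
 */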
void
rfs4_resource_err(struct svc_req *req, COMPOUND4args *argsp)
{
	COMPOUND4res res_buf, *rbp;
	nfs_resop4 *resop;
	PUTFH4res *resp;

	rbp = &res_buf;

	/*
	 * Form a reply tag by copying over the request tag.
	 */
	rbp->tag.utf8string_val =
	    kmem_alloc(argsp->tag.utf8string_len, KM_SLEEP);
	rbp->tag.utf8string_len = argsp->tag.utf8string_len;
	bcopy(argsp->tag.utf8string_val, rbp->tag.utf8string_val,
	    rbp->tag.utf8string_len);

	rbp->array_len = 1;
	rbp->array = kmem_zalloc(rbp->array_len * sizeof (nfs_resop4),
	    KM_SLEEP);
	resop = &rbp->array[0];
	resop->resop = argsp->array[0].argop;	/* copy first op over */

	/* Any op will do, just need to access status field */
	resp = &resop->nfs_resop4_u.opputfh;

	/*
	 * NFS4ERR_RESOURCE is allowed for all ops, except OP_ILLEGAL.
	 * Note that all op numbers in the compound array were already
	 * validated by the XDR decoder (xdr_COMPOUND4args_srv()).
	 */
	resp->status = (resop->resop == OP_ILLEGAL ?
	    NFS4ERR_OP_ILLEGAL : NFS4ERR_RESOURCE);

	/* compound status is same as first op status */
	rbp->status = resp->status;

	if (!svc_sendreply(req->rq_xprt, xdr_COMPOUND4res_srv, (char *)rbp)) {
		DTRACE_PROBE2(nfss__rsrc_err__sendfail,
		    SVCXPRT *, req->rq_xprt, char *, rbp);
		svcerr_systemerr(req->rq_xprt);
	}

	UTF8STRING_FREE(rbp->tag);
	kmem_free(rbp->array, rbp->array_len * sizeof (nfs_resop4));
}