Print this page
OS-5538 eventfd wrongly blocks writers in semaphore mode
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Bryan Cantrill <bryan@joyent.com>
Approved by: Jerry Jelinek <jerry.jelinek@joyent.com>
| Split |
Close |
| Expand all |
| Collapse all |
--- old/usr/src/uts/common/io/eventfd.c
+++ new/usr/src/uts/common/io/eventfd.c
1 1 /*
2 2 * This file and its contents are supplied under the terms of the
|
↓ open down ↓ |
2 lines elided |
↑ open up ↑ |
3 3 * Common Development and Distribution License ("CDDL"), version 1.0.
4 4 * You may only use this file in accordance with the terms of version
5 5 * 1.0 of the CDDL.
6 6 *
7 7 * A full copy of the text of the CDDL should have accompanied this
8 8 * source. A copy of the CDDL is also available via the Internet at
9 9 * http://www.illumos.org/license/CDDL.
10 10 */
11 11
12 12 /*
13 - * Copyright (c) 2015 Joyent, Inc. All rights reserved.
13 + * Copyright 2016 Joyent, Inc.
14 14 */
15 15
16 16 /*
17 17 * Support for the eventfd facility, a Linux-borne facility for user-generated
18 18 * file descriptor-based events.
19 19 */
20 20
21 21 #include <sys/ddi.h>
22 22 #include <sys/sunddi.h>
23 23 #include <sys/eventfd.h>
24 24 #include <sys/conf.h>
25 25 #include <sys/vmem.h>
26 26 #include <sys/sysmacros.h>
27 27 #include <sys/filio.h>
28 28 #include <sys/stat.h>
29 29 #include <sys/file.h>
|
↓ open down ↓ |
6 lines elided |
↑ open up ↑ |
30 30
31 31 struct eventfd_state;
32 32 typedef struct eventfd_state eventfd_state_t;
33 33
/*
 * Per-open state for a single eventfd instance.  Everything other than
 * efd_next is manipulated by the read/write/poll/ioctl entry points with
 * efd_lock held.
 */
34 34 struct eventfd_state {
35 35 kmutex_t efd_lock; /* lock protecting this instance's state */
36 36 boolean_t efd_semaphore; /* boolean: semaphore read semantics */
37 37 kcondvar_t efd_cv; /* condvar for blocked readers and writers */
38 38 pollhead_t efd_pollhd; /* poll head */
39 39 uint64_t efd_value; /* current counter value */
40 + size_t efd_bwriters; /* count of writers blocked on efd_cv */
40 41 eventfd_state_t *efd_next; /* next state on global list */
41 42 };
42 43
43 44 /*
44 45 * Internal global variables.
45 46 */
46 47 static kmutex_t eventfd_lock; /* lock held while changing global state */
47 48 static dev_info_t *eventfd_devi; /* device info, set at attach */
48 49 static vmem_t *eventfd_minor; /* minor number arena */
49 50 static void *eventfd_softstate; /* softstate pointer */
50 51 static eventfd_state_t *eventfd_state; /* head of global list of state */
51 52
52 53 /*ARGSUSED*/
53 54 static int
54 55 eventfd_open(dev_t *devp, int flag, int otyp, cred_t *cred_p)
55 56 {
56 57 eventfd_state_t *state;
57 58 major_t major = getemajor(*devp);
58 59 minor_t minor = getminor(*devp);
59 60
60 61 if (minor != EVENTFDMNRN_EVENTFD)
61 62 return (ENXIO);
62 63
63 64 mutex_enter(&eventfd_lock);
64 65
65 66 minor = (minor_t)(uintptr_t)vmem_alloc(eventfd_minor, 1,
66 67 VM_BESTFIT | VM_SLEEP);
67 68
68 69 if (ddi_soft_state_zalloc(eventfd_softstate, minor) != DDI_SUCCESS) {
69 70 vmem_free(eventfd_minor, (void *)(uintptr_t)minor, 1);
70 71 mutex_exit(&eventfd_lock);
71 72 return (NULL);
72 73 }
73 74
74 75 state = ddi_get_soft_state(eventfd_softstate, minor);
75 76 *devp = makedevice(major, minor);
76 77
77 78 state->efd_next = eventfd_state;
78 79 eventfd_state = state;
79 80
80 81 mutex_exit(&eventfd_lock);
81 82
82 83 return (0);
83 84 }
84 85
85 86 /*ARGSUSED*/
86 87 static int
87 88 eventfd_read(dev_t dev, uio_t *uio, cred_t *cr)
88 89 {
89 90 eventfd_state_t *state;
90 91 minor_t minor = getminor(dev);
91 92 uint64_t val, oval;
92 93 int err;
93 94
94 95 if (uio->uio_resid < sizeof (val))
95 96 return (EINVAL);
96 97
97 98 state = ddi_get_soft_state(eventfd_softstate, minor);
98 99
99 100 mutex_enter(&state->efd_lock);
100 101
101 102 while (state->efd_value == 0) {
102 103 if (uio->uio_fmode & (FNDELAY|FNONBLOCK)) {
103 104 mutex_exit(&state->efd_lock);
104 105 return (EAGAIN);
105 106 }
106 107
107 108 if (!cv_wait_sig_swap(&state->efd_cv, &state->efd_lock)) {
108 109 mutex_exit(&state->efd_lock);
109 110 return (EINTR);
110 111 }
111 112 }
112 113
113 114 /*
114 115 * We have a non-zero value and we own the lock; our behavior now
115 116 * depends on whether or not EFD_SEMAPHORE was set when the eventfd
116 117 * was created.
117 118 */
118 119 val = oval = state->efd_value;
|
↓ open down ↓ |
69 lines elided |
↑ open up ↑ |
119 120
120 121 if (state->efd_semaphore) {
121 122 state->efd_value--;
122 123 val = 1;
123 124 } else {
124 125 state->efd_value = 0;
125 126 }
126 127
127 128 err = uiomove(&val, sizeof (val), UIO_READ, uio);
128 129
130 + /*
131 + * Wake any writers blocked on this eventfd as this read operation may
132 + * have created adequate capacity for their values.
133 + */
134 + if (state->efd_bwriters != 0) {
135 + cv_broadcast(&state->efd_cv);
136 + }
129 137 mutex_exit(&state->efd_lock);
130 138
139 + /*
140 + * It is necessary to emit POLLOUT events only when the eventfd
141 + * transitions from EVENTFD_VALMAX to a lower value. At all other
142 + * times, it is already considered writable by poll.
143 + */
131 144 if (oval == EVENTFD_VALMAX) {
132 - cv_broadcast(&state->efd_cv);
133 145 pollwakeup(&state->efd_pollhd, POLLWRNORM | POLLOUT);
134 146 }
135 147
136 148 return (err);
137 149 }
138 150
139 151 /*ARGSUSED*/
140 152 static int
141 153 eventfd_write(dev_t dev, struct uio *uio, cred_t *credp)
142 154 {
143 155 eventfd_state_t *state;
144 156 minor_t minor = getminor(dev);
145 157 uint64_t val, oval;
146 158 int err;
147 159
148 160 if (uio->uio_resid < sizeof (val))
149 161 return (EINVAL);
150 162
151 163 if ((err = uiomove(&val, sizeof (val), UIO_WRITE, uio)) != 0)
152 164 return (err);
153 165
154 166 if (val > EVENTFD_VALMAX)
155 167 return (EINVAL);
156 168
|
↓ open down ↓ |
14 lines elided |
↑ open up ↑ |
157 169 state = ddi_get_soft_state(eventfd_softstate, minor);
158 170
159 171 mutex_enter(&state->efd_lock);
160 172
161 173 while (val > EVENTFD_VALMAX - state->efd_value) {
162 174 if (uio->uio_fmode & (FNDELAY|FNONBLOCK)) {
163 175 mutex_exit(&state->efd_lock);
164 176 return (EAGAIN);
165 177 }
166 178
179 + state->efd_bwriters++;
167 180 if (!cv_wait_sig_swap(&state->efd_cv, &state->efd_lock)) {
181 + state->efd_bwriters--;
168 182 mutex_exit(&state->efd_lock);
169 183 return (EINTR);
170 184 }
185 + state->efd_bwriters--;
171 186 }
172 187
173 188 /*
174 189 * We now know that we can add the value without overflowing.
175 190 */
176 191 state->efd_value = (oval = state->efd_value) + val;
177 192
193 + /*
194 + * If the value was previously "empty", notify blocked readers that
195 + * data is available.
196 + */
197 + if (oval == 0) {
198 + cv_broadcast(&state->efd_cv);
199 + }
178 200 mutex_exit(&state->efd_lock);
179 201
202 + /*
203 + * Notify pollers as well if the eventfd is now readable.
204 + */
180 205 if (oval == 0) {
181 - cv_broadcast(&state->efd_cv);
182 206 pollwakeup(&state->efd_pollhd, POLLRDNORM | POLLIN);
183 207 }
184 208
185 209 return (0);
186 210 }
187 211
188 212 /*ARGSUSED*/
189 213 static int
190 214 eventfd_poll(dev_t dev, short events, int anyyet, short *reventsp,
191 215 struct pollhead **phpp)
192 216 {
193 217 eventfd_state_t *state;
194 218 minor_t minor = getminor(dev);
195 219 short revents = 0;
196 220
197 221 state = ddi_get_soft_state(eventfd_softstate, minor);
198 222
199 223 mutex_enter(&state->efd_lock);
200 224
201 225 if (state->efd_value > 0)
202 226 revents |= POLLRDNORM | POLLIN;
203 227
204 228 if (state->efd_value < EVENTFD_VALMAX)
205 229 revents |= POLLWRNORM | POLLOUT;
206 230
207 231 if (!(*reventsp = revents & events) && !anyyet)
208 232 *phpp = &state->efd_pollhd;
209 233
210 234 mutex_exit(&state->efd_lock);
211 235
212 236 return (0);
213 237 }
214 238
215 239 /*ARGSUSED*/
216 240 static int
217 241 eventfd_ioctl(dev_t dev, int cmd, intptr_t arg, int md, cred_t *cr, int *rv)
218 242 {
219 243 eventfd_state_t *state;
220 244 minor_t minor = getminor(dev);
221 245
222 246 state = ddi_get_soft_state(eventfd_softstate, minor);
223 247
224 248 switch (cmd) {
225 249 case EVENTFDIOC_SEMAPHORE: {
226 250 mutex_enter(&state->efd_lock);
227 251 state->efd_semaphore ^= 1;
228 252 mutex_exit(&state->efd_lock);
229 253
230 254 return (0);
231 255 }
232 256
233 257 default:
234 258 break;
235 259 }
236 260
237 261 return (ENOTTY);
238 262 }
239 263
240 264 /*ARGSUSED*/
241 265 static int
242 266 eventfd_close(dev_t dev, int flag, int otyp, cred_t *cred_p)
243 267 {
244 268 eventfd_state_t *state, **sp;
245 269 minor_t minor = getminor(dev);
246 270
247 271 state = ddi_get_soft_state(eventfd_softstate, minor);
248 272
249 273 if (state->efd_pollhd.ph_list != NULL) {
250 274 pollwakeup(&state->efd_pollhd, POLLERR);
251 275 pollhead_clean(&state->efd_pollhd);
252 276 }
253 277
254 278 mutex_enter(&eventfd_lock);
255 279
256 280 /*
257 281 * Remove our state from our global list.
258 282 */
259 283 for (sp = &eventfd_state; *sp != state; sp = &((*sp)->efd_next))
260 284 VERIFY(*sp != NULL);
261 285
262 286 *sp = (*sp)->efd_next;
263 287
264 288 ddi_soft_state_free(eventfd_softstate, minor);
265 289 vmem_free(eventfd_minor, (void *)(uintptr_t)minor, 1);
266 290
267 291 mutex_exit(&eventfd_lock);
268 292
269 293 return (0);
270 294 }
271 295
272 296 static int
273 297 eventfd_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
274 298 {
275 299 switch (cmd) {
276 300 case DDI_ATTACH:
277 301 break;
278 302
279 303 case DDI_RESUME:
280 304 return (DDI_SUCCESS);
281 305
282 306 default:
283 307 return (DDI_FAILURE);
284 308 }
285 309
286 310 mutex_enter(&eventfd_lock);
287 311
288 312 if (ddi_soft_state_init(&eventfd_softstate,
289 313 sizeof (eventfd_state_t), 0) != 0) {
290 314 cmn_err(CE_NOTE, "/dev/eventfd failed to create soft state");
291 315 mutex_exit(&eventfd_lock);
292 316 return (DDI_FAILURE);
293 317 }
294 318
295 319 if (ddi_create_minor_node(devi, "eventfd", S_IFCHR,
296 320 EVENTFDMNRN_EVENTFD, DDI_PSEUDO, NULL) == DDI_FAILURE) {
297 321 cmn_err(CE_NOTE, "/dev/eventfd couldn't create minor node");
298 322 ddi_soft_state_fini(&eventfd_softstate);
299 323 mutex_exit(&eventfd_lock);
300 324 return (DDI_FAILURE);
301 325 }
302 326
303 327 ddi_report_dev(devi);
304 328 eventfd_devi = devi;
305 329
306 330 eventfd_minor = vmem_create("eventfd_minor", (void *)EVENTFDMNRN_CLONE,
307 331 UINT32_MAX - EVENTFDMNRN_CLONE, 1, NULL, NULL, NULL, 0,
308 332 VM_SLEEP | VMC_IDENTIFIER);
309 333
310 334 mutex_exit(&eventfd_lock);
311 335
312 336 return (DDI_SUCCESS);
313 337 }
314 338
315 339 /*ARGSUSED*/
316 340 static int
317 341 eventfd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
318 342 {
319 343 switch (cmd) {
320 344 case DDI_DETACH:
321 345 break;
322 346
323 347 case DDI_SUSPEND:
324 348 return (DDI_SUCCESS);
325 349
326 350 default:
327 351 return (DDI_FAILURE);
328 352 }
329 353
330 354 mutex_enter(&eventfd_lock);
331 355 vmem_destroy(eventfd_minor);
332 356
333 357 ddi_remove_minor_node(eventfd_devi, NULL);
334 358 eventfd_devi = NULL;
335 359
336 360 ddi_soft_state_fini(&eventfd_softstate);
337 361 mutex_exit(&eventfd_lock);
338 362
339 363 return (DDI_SUCCESS);
340 364 }
341 365
342 366 /*ARGSUSED*/
343 367 static int
344 368 eventfd_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
345 369 {
346 370 int error;
347 371
348 372 switch (infocmd) {
349 373 case DDI_INFO_DEVT2DEVINFO:
350 374 *result = (void *)eventfd_devi;
351 375 error = DDI_SUCCESS;
352 376 break;
353 377 case DDI_INFO_DEVT2INSTANCE:
354 378 *result = (void *)0;
355 379 error = DDI_SUCCESS;
356 380 break;
357 381 default:
358 382 error = DDI_FAILURE;
359 383 }
360 384 return (error);
361 385 }
362 386
/*
 * Character device entry points for eventfd.  Only open, close, read, write,
 * ioctl and poll are implemented by this driver; the remaining slots are
 * filled with the generic nulldev/nodev/ddi_prop_op handlers.
 */
363 387 static struct cb_ops eventfd_cb_ops = {
364 388 eventfd_open, /* open */
365 389 eventfd_close, /* close */
366 390 nulldev, /* strategy */
367 391 nulldev, /* print */
368 392 nodev, /* dump */
369 393 eventfd_read, /* read */
370 394 eventfd_write, /* write */
371 395 eventfd_ioctl, /* ioctl */
372 396 nodev, /* devmap */
373 397 nodev, /* mmap */
374 398 nodev, /* segmap */
375 399 eventfd_poll, /* poll */
376 400 ddi_prop_op, /* cb_prop_op */
377 401 0, /* streamtab (none) */
378 402 D_NEW | D_MP /* driver compatibility flags */
379 403 };
380 404
/*
 * Device operations for eventfd: wires the getinfo, attach and detach
 * routines and eventfd_cb_ops into the driver framework.
 */
381 405 static struct dev_ops eventfd_ops = {
382 406 DEVO_REV, /* devo_rev */
383 407 0, /* refcnt */
384 408 eventfd_info, /* get_dev_info */
385 409 nulldev, /* identify */
386 410 nulldev, /* probe */
387 411 eventfd_attach, /* attach */
388 412 eventfd_detach, /* detach */
389 413 nodev, /* reset */
390 414 &eventfd_cb_ops, /* driver operations */
391 415 NULL, /* bus operations (none) */
392 416 nodev, /* dev power */
393 417 ddi_quiesce_not_needed, /* quiesce */
394 418 };
395 419
/*
 * Module linkage information describing this driver to the module framework.
 */
396 420 static struct modldrv modldrv = {
397 421 &mod_driverops, /* module type (this is a pseudo driver) */
398 422 "eventfd support", /* name of module */
399 423 &eventfd_ops, /* driver ops */
400 424 };
401 425
/*
 * Linkage passed to mod_install(), mod_info() and mod_remove() by the module
 * entry points _init(), _info() and _fini().
 */
402 426 static struct modlinkage modlinkage = {
403 427 MODREV_1,
404 428 (void *)&modldrv, /* the one linkage element: this driver */
405 429 NULL /* presumably terminates the linkage list */
406 430 };
407 431
408 432 int
409 433 _init(void)
410 434 {
411 435 return (mod_install(&modlinkage));
412 436 }
413 437
414 438 int
415 439 _info(struct modinfo *modinfop)
416 440 {
417 441 return (mod_info(&modlinkage, modinfop));
418 442 }
419 443
420 444 int
421 445 _fini(void)
422 446 {
423 447 return (mod_remove(&modlinkage));
424 448 }
|
↓ open down ↓ |
233 lines elided |
↑ open up ↑ |
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX