Print this page
8634 epoll fails to wake on certain edge-triggered conditions
8635 epoll should not emit POLLNVAL
8636 recursive epoll should emit EPOLLRDNORM
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Toomas Soome <tsoome@me.com>
Reviewed by: Igor Kozhukhov <igor@dilos.org>
| Split |
Close |
| Expand all |
| Collapse all |
--- old/usr/src/uts/common/io/eventfd.c
+++ new/usr/src/uts/common/io/eventfd.c
1 1 /*
2 2 * This file and its contents are supplied under the terms of the
|
↓ open down ↓ |
2 lines elided |
↑ open up ↑ |
3 3 * Common Development and Distribution License ("CDDL"), version 1.0.
4 4 * You may only use this file in accordance with the terms of version
5 5 * 1.0 of the CDDL.
6 6 *
7 7 * A full copy of the text of the CDDL should have accompanied this
8 8 * source. A copy of the CDDL is also available via the Internet at
9 9 * http://www.illumos.org/license/CDDL.
10 10 */
11 11
12 12 /*
13 - * Copyright 2016 Joyent, Inc.
13 + * Copyright 2017 Joyent, Inc.
14 14 */
15 15
16 16 /*
17 17 * Support for the eventfd facility, a Linux-borne facility for user-generated
18 18 * file descriptor-based events.
19 19 */
20 20
21 21 #include <sys/ddi.h>
22 22 #include <sys/sunddi.h>
23 23 #include <sys/eventfd.h>
24 24 #include <sys/conf.h>
25 25 #include <sys/vmem.h>
26 26 #include <sys/sysmacros.h>
27 27 #include <sys/filio.h>
28 28 #include <sys/stat.h>
29 29 #include <sys/file.h>
30 30
31 31 struct eventfd_state;
32 32 typedef struct eventfd_state eventfd_state_t;
33 33
34 34 struct eventfd_state {
35 35 kmutex_t efd_lock; /* lock protecting state */
36 36 boolean_t efd_semaphore; /* boolean: sema. semantics */
37 37 kcondvar_t efd_cv; /* condvar */
38 38 pollhead_t efd_pollhd; /* poll head */
39 39 uint64_t efd_value; /* value */
40 40 size_t efd_bwriters; /* count of blocked writers */
41 41 eventfd_state_t *efd_next; /* next state on global list */
42 42 };
43 43
44 44 /*
45 45 * Internal global variables.
46 46 */
47 47 static kmutex_t eventfd_lock; /* lock protecting state */
48 48 static dev_info_t *eventfd_devi; /* device info */
49 49 static vmem_t *eventfd_minor; /* minor number arena */
50 50 static void *eventfd_softstate; /* softstate pointer */
51 51 static eventfd_state_t *eventfd_state; /* global list of state */
52 52
53 53 /*ARGSUSED*/
54 54 static int
55 55 eventfd_open(dev_t *devp, int flag, int otyp, cred_t *cred_p)
56 56 {
57 57 eventfd_state_t *state;
58 58 major_t major = getemajor(*devp);
59 59 minor_t minor = getminor(*devp);
60 60
61 61 if (minor != EVENTFDMNRN_EVENTFD)
62 62 return (ENXIO);
63 63
64 64 mutex_enter(&eventfd_lock);
65 65
66 66 minor = (minor_t)(uintptr_t)vmem_alloc(eventfd_minor, 1,
67 67 VM_BESTFIT | VM_SLEEP);
68 68
69 69 if (ddi_soft_state_zalloc(eventfd_softstate, minor) != DDI_SUCCESS) {
70 70 vmem_free(eventfd_minor, (void *)(uintptr_t)minor, 1);
71 71 mutex_exit(&eventfd_lock);
72 72 return (NULL);
73 73 }
74 74
75 75 state = ddi_get_soft_state(eventfd_softstate, minor);
76 76 *devp = makedevice(major, minor);
77 77
78 78 state->efd_next = eventfd_state;
79 79 eventfd_state = state;
80 80
81 81 mutex_exit(&eventfd_lock);
82 82
83 83 return (0);
84 84 }
85 85
86 86 /*ARGSUSED*/
87 87 static int
88 88 eventfd_read(dev_t dev, uio_t *uio, cred_t *cr)
89 89 {
90 90 eventfd_state_t *state;
91 91 minor_t minor = getminor(dev);
92 92 uint64_t val, oval;
93 93 int err;
94 94
95 95 if (uio->uio_resid < sizeof (val))
96 96 return (EINVAL);
97 97
98 98 state = ddi_get_soft_state(eventfd_softstate, minor);
99 99
100 100 mutex_enter(&state->efd_lock);
101 101
102 102 while (state->efd_value == 0) {
103 103 if (uio->uio_fmode & (FNDELAY|FNONBLOCK)) {
104 104 mutex_exit(&state->efd_lock);
105 105 return (EAGAIN);
106 106 }
107 107
108 108 if (!cv_wait_sig_swap(&state->efd_cv, &state->efd_lock)) {
109 109 mutex_exit(&state->efd_lock);
110 110 return (EINTR);
111 111 }
112 112 }
113 113
114 114 /*
115 115 * We have a non-zero value and we own the lock; our behavior now
116 116 * depends on whether or not EFD_SEMAPHORE was set when the eventfd
117 117 * was created.
118 118 */
119 119 val = oval = state->efd_value;
120 120
121 121 if (state->efd_semaphore) {
122 122 state->efd_value--;
123 123 val = 1;
124 124 } else {
125 125 state->efd_value = 0;
126 126 }
127 127
128 128 err = uiomove(&val, sizeof (val), UIO_READ, uio);
129 129
130 130 /*
131 131 * Wake any writers blocked on this eventfd as this read operation may
132 132 * have created adequate capacity for their values.
133 133 */
134 134 if (state->efd_bwriters != 0) {
135 135 cv_broadcast(&state->efd_cv);
136 136 }
137 137 mutex_exit(&state->efd_lock);
138 138
139 139 /*
140 140 * It is necessary to emit POLLOUT events only when the eventfd
141 141 * transitions from EVENTFD_VALMAX to a lower value. At all other
142 142 * times, it is already considered writable by poll.
143 143 */
144 144 if (oval == EVENTFD_VALMAX) {
145 145 pollwakeup(&state->efd_pollhd, POLLWRNORM | POLLOUT);
146 146 }
147 147
148 148 return (err);
149 149 }
150 150
151 151 /*ARGSUSED*/
152 152 static int
153 153 eventfd_write(dev_t dev, struct uio *uio, cred_t *credp)
154 154 {
155 155 eventfd_state_t *state;
156 156 minor_t minor = getminor(dev);
157 157 uint64_t val, oval;
158 158 int err;
159 159
160 160 if (uio->uio_resid < sizeof (val))
161 161 return (EINVAL);
162 162
163 163 if ((err = uiomove(&val, sizeof (val), UIO_WRITE, uio)) != 0)
164 164 return (err);
165 165
166 166 if (val > EVENTFD_VALMAX)
167 167 return (EINVAL);
168 168
169 169 state = ddi_get_soft_state(eventfd_softstate, minor);
170 170
171 171 mutex_enter(&state->efd_lock);
172 172
173 173 while (val > EVENTFD_VALMAX - state->efd_value) {
174 174 if (uio->uio_fmode & (FNDELAY|FNONBLOCK)) {
175 175 mutex_exit(&state->efd_lock);
176 176 return (EAGAIN);
177 177 }
178 178
179 179 state->efd_bwriters++;
180 180 if (!cv_wait_sig_swap(&state->efd_cv, &state->efd_lock)) {
181 181 state->efd_bwriters--;
182 182 mutex_exit(&state->efd_lock);
183 183 return (EINTR);
184 184 }
185 185 state->efd_bwriters--;
186 186 }
187 187
188 188 /*
189 189 * We now know that we can add the value without overflowing.
190 190 */
191 191 state->efd_value = (oval = state->efd_value) + val;
192 192
193 193 /*
194 194 * If the value was previously "empty", notify blocked readers that
195 195 * data is available.
196 196 */
197 197 if (oval == 0) {
198 198 cv_broadcast(&state->efd_cv);
199 199 }
200 200 mutex_exit(&state->efd_lock);
201 201
202 202 /*
203 203 * Notify pollers as well if the eventfd is now readable.
204 204 */
205 205 if (oval == 0) {
206 206 pollwakeup(&state->efd_pollhd, POLLRDNORM | POLLIN);
207 207 }
208 208
209 209 return (0);
210 210 }
211 211
212 212 /*ARGSUSED*/
213 213 static int
214 214 eventfd_poll(dev_t dev, short events, int anyyet, short *reventsp,
215 215 struct pollhead **phpp)
216 216 {
217 217 eventfd_state_t *state;
218 218 minor_t minor = getminor(dev);
219 219 short revents = 0;
220 220
|
↓ open down ↓ |
197 lines elided |
↑ open up ↑ |
221 221 state = ddi_get_soft_state(eventfd_softstate, minor);
222 222
223 223 mutex_enter(&state->efd_lock);
224 224
225 225 if (state->efd_value > 0)
226 226 revents |= POLLRDNORM | POLLIN;
227 227
228 228 if (state->efd_value < EVENTFD_VALMAX)
229 229 revents |= POLLWRNORM | POLLOUT;
230 230
231 - if (!(*reventsp = revents & events) && !anyyet)
231 + *reventsp = revents & events;
232 + if ((*reventsp == 0 && !anyyet) || (events & POLLET)) {
232 233 *phpp = &state->efd_pollhd;
234 + }
233 235
234 236 mutex_exit(&state->efd_lock);
235 237
236 238 return (0);
237 239 }
238 240
239 241 /*ARGSUSED*/
240 242 static int
241 243 eventfd_ioctl(dev_t dev, int cmd, intptr_t arg, int md, cred_t *cr, int *rv)
242 244 {
243 245 eventfd_state_t *state;
244 246 minor_t minor = getminor(dev);
245 247
246 248 state = ddi_get_soft_state(eventfd_softstate, minor);
247 249
248 250 switch (cmd) {
249 251 case EVENTFDIOC_SEMAPHORE: {
250 252 mutex_enter(&state->efd_lock);
251 253 state->efd_semaphore ^= 1;
252 254 mutex_exit(&state->efd_lock);
253 255
254 256 return (0);
255 257 }
256 258
257 259 default:
258 260 break;
259 261 }
260 262
261 263 return (ENOTTY);
262 264 }
263 265
264 266 /*ARGSUSED*/
265 267 static int
266 268 eventfd_close(dev_t dev, int flag, int otyp, cred_t *cred_p)
267 269 {
268 270 eventfd_state_t *state, **sp;
269 271 minor_t minor = getminor(dev);
270 272
271 273 state = ddi_get_soft_state(eventfd_softstate, minor);
272 274
273 275 if (state->efd_pollhd.ph_list != NULL) {
274 276 pollwakeup(&state->efd_pollhd, POLLERR);
275 277 pollhead_clean(&state->efd_pollhd);
276 278 }
277 279
278 280 mutex_enter(&eventfd_lock);
279 281
280 282 /*
281 283 * Remove our state from our global list.
282 284 */
283 285 for (sp = &eventfd_state; *sp != state; sp = &((*sp)->efd_next))
284 286 VERIFY(*sp != NULL);
285 287
286 288 *sp = (*sp)->efd_next;
287 289
288 290 ddi_soft_state_free(eventfd_softstate, minor);
289 291 vmem_free(eventfd_minor, (void *)(uintptr_t)minor, 1);
290 292
291 293 mutex_exit(&eventfd_lock);
292 294
293 295 return (0);
294 296 }
295 297
296 298 static int
297 299 eventfd_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
298 300 {
299 301 switch (cmd) {
300 302 case DDI_ATTACH:
301 303 break;
302 304
303 305 case DDI_RESUME:
304 306 return (DDI_SUCCESS);
305 307
306 308 default:
307 309 return (DDI_FAILURE);
308 310 }
309 311
310 312 mutex_enter(&eventfd_lock);
311 313
312 314 if (ddi_soft_state_init(&eventfd_softstate,
313 315 sizeof (eventfd_state_t), 0) != 0) {
314 316 cmn_err(CE_NOTE, "/dev/eventfd failed to create soft state");
315 317 mutex_exit(&eventfd_lock);
316 318 return (DDI_FAILURE);
317 319 }
318 320
319 321 if (ddi_create_minor_node(devi, "eventfd", S_IFCHR,
320 322 EVENTFDMNRN_EVENTFD, DDI_PSEUDO, NULL) == DDI_FAILURE) {
321 323 cmn_err(CE_NOTE, "/dev/eventfd couldn't create minor node");
322 324 ddi_soft_state_fini(&eventfd_softstate);
323 325 mutex_exit(&eventfd_lock);
324 326 return (DDI_FAILURE);
325 327 }
326 328
327 329 ddi_report_dev(devi);
328 330 eventfd_devi = devi;
329 331
330 332 eventfd_minor = vmem_create("eventfd_minor", (void *)EVENTFDMNRN_CLONE,
331 333 UINT32_MAX - EVENTFDMNRN_CLONE, 1, NULL, NULL, NULL, 0,
332 334 VM_SLEEP | VMC_IDENTIFIER);
333 335
334 336 mutex_exit(&eventfd_lock);
335 337
336 338 return (DDI_SUCCESS);
337 339 }
338 340
339 341 /*ARGSUSED*/
340 342 static int
341 343 eventfd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
342 344 {
343 345 switch (cmd) {
344 346 case DDI_DETACH:
345 347 break;
346 348
347 349 case DDI_SUSPEND:
348 350 return (DDI_SUCCESS);
349 351
350 352 default:
351 353 return (DDI_FAILURE);
352 354 }
353 355
354 356 mutex_enter(&eventfd_lock);
355 357 vmem_destroy(eventfd_minor);
356 358
357 359 ddi_remove_minor_node(eventfd_devi, NULL);
358 360 eventfd_devi = NULL;
359 361
360 362 ddi_soft_state_fini(&eventfd_softstate);
361 363 mutex_exit(&eventfd_lock);
362 364
363 365 return (DDI_SUCCESS);
364 366 }
365 367
366 368 /*ARGSUSED*/
367 369 static int
368 370 eventfd_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
369 371 {
370 372 int error;
371 373
372 374 switch (infocmd) {
373 375 case DDI_INFO_DEVT2DEVINFO:
374 376 *result = (void *)eventfd_devi;
375 377 error = DDI_SUCCESS;
376 378 break;
377 379 case DDI_INFO_DEVT2INSTANCE:
378 380 *result = (void *)0;
379 381 error = DDI_SUCCESS;
380 382 break;
381 383 default:
382 384 error = DDI_FAILURE;
383 385 }
384 386 return (error);
385 387 }
386 388
387 389 static struct cb_ops eventfd_cb_ops = {
388 390 eventfd_open, /* open */
389 391 eventfd_close, /* close */
390 392 nulldev, /* strategy */
391 393 nulldev, /* print */
392 394 nodev, /* dump */
393 395 eventfd_read, /* read */
394 396 eventfd_write, /* write */
395 397 eventfd_ioctl, /* ioctl */
396 398 nodev, /* devmap */
397 399 nodev, /* mmap */
398 400 nodev, /* segmap */
399 401 eventfd_poll, /* poll */
400 402 ddi_prop_op, /* cb_prop_op */
401 403 0, /* streamtab */
402 404 D_NEW | D_MP /* Driver compatibility flag */
403 405 };
404 406
405 407 static struct dev_ops eventfd_ops = {
406 408 DEVO_REV, /* devo_rev */
407 409 0, /* refcnt */
408 410 eventfd_info, /* get_dev_info */
409 411 nulldev, /* identify */
410 412 nulldev, /* probe */
411 413 eventfd_attach, /* attach */
412 414 eventfd_detach, /* detach */
413 415 nodev, /* reset */
414 416 &eventfd_cb_ops, /* driver operations */
415 417 NULL, /* bus operations */
416 418 nodev, /* dev power */
417 419 ddi_quiesce_not_needed, /* quiesce */
418 420 };
419 421
420 422 static struct modldrv modldrv = {
421 423 &mod_driverops, /* module type (this is a pseudo driver) */
422 424 "eventfd support", /* name of module */
423 425 &eventfd_ops, /* driver ops */
424 426 };
425 427
426 428 static struct modlinkage modlinkage = {
427 429 MODREV_1,
428 430 (void *)&modldrv,
429 431 NULL
430 432 };
431 433
432 434 int
433 435 _init(void)
434 436 {
435 437 return (mod_install(&modlinkage));
436 438 }
437 439
438 440 int
439 441 _info(struct modinfo *modinfop)
440 442 {
441 443 return (mod_info(&modlinkage, modinfop));
442 444 }
443 445
444 446 int
445 447 _fini(void)
446 448 {
447 449 return (mod_remove(&modlinkage));
448 450 }
|
↓ open down ↓ |
206 lines elided |
↑ open up ↑ |
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX