/*
 * wepoll - epoll for Windows
 * https://github.com/piscisaureus/wepoll
 *
 * Copyright 2012-2020, Bert Belder <bertbelder@gmail.com>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met:
 *
 *   * Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#define WEPOLL_EXPORT

#include <stdint.h>

enum EPOLL_EVENTS {
  EPOLLIN      = (int) (1U <<  0),
  EPOLLPRI     = (int) (1U <<  1),
  EPOLLOUT     = (int) (1U <<  2),
  EPOLLERR     = (int) (1U <<  3),
  EPOLLHUP     = (int) (1U <<  4),
  EPOLLRDNORM  = (int) (1U <<  6),
  EPOLLRDBAND  = (int) (1U <<  7),
  EPOLLWRNORM  = (int) (1U <<  8),
  EPOLLWRBAND  = (int) (1U <<  9),
  EPOLLMSG     = (int) (1U << 10), /* Never reported. */
  EPOLLRDHUP   = (int) (1U << 13),
  EPOLLONESHOT = (int) (1U << 31)
};

#define EPOLLIN      (1U <<  0)
#define EPOLLPRI     (1U <<  1)
#define EPOLLOUT     (1U <<  2)
#define EPOLLERR     (1U <<  3)
#define EPOLLHUP     (1U <<  4)
#define EPOLLRDNORM  (1U <<  6)
#define EPOLLRDBAND  (1U <<  7)
#define EPOLLWRNORM  (1U <<  8)
#define EPOLLWRBAND  (1U <<  9)
#define EPOLLMSG     (1U << 10)
#define EPOLLRDHUP   (1U << 13)
#define EPOLLONESHOT (1U << 31)

#define EPOLL_CTL_ADD 1
#define EPOLL_CTL_MOD 2
#define EPOLL_CTL_DEL 3

typedef void* HANDLE;
typedef uintptr_t SOCKET;

typedef union epoll_data {
  void* ptr;
  int fd;
  uint32_t u32;
  uint64_t u64;
  SOCKET sock; /* Windows specific */
  HANDLE hnd;  /* Windows specific */
} epoll_data_t;

struct epoll_event {
  uint32_t events;   /* Epoll events and flags */
  epoll_data_t data; /* User data variable */
};

#ifdef __cplusplus
extern "C" {
#endif

WEPOLL_EXPORT HANDLE epoll_create(int size);
WEPOLL_EXPORT HANDLE epoll_create1(int flags);

WEPOLL_EXPORT int epoll_close(HANDLE ephnd);

WEPOLL_EXPORT int epoll_ctl(HANDLE ephnd,
                            int op,
                            SOCKET sock,
                            struct epoll_event* event);

WEPOLL_EXPORT int epoll_wait(HANDLE ephnd,
                             struct epoll_event* events,
                             int maxevents,
                             int timeout);

#ifdef __cplusplus
} /* extern "C" */
#endif

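/* Illustrative usage sketch (not part of wepoll itself): a hypothetical caller
 * with a connected SOCKET `s`, obtained elsewhere, could poll it for
 * readability like this:
 *
 *   HANDLE ephnd = epoll_create1(0);
 *   struct epoll_event ev = {0}, out;
 *   ev.events = EPOLLIN;
 *   ev.data.sock = s;
 *   epoll_ctl(ephnd, EPOLL_CTL_ADD, s, &ev);
 *   int n = epoll_wait(ephnd, &out, 1, 1000); // wait up to 1000 ms
 *   // n > 0: `out` describes a ready socket; n == 0: timed out; n < 0: error.
 *   epoll_close(ephnd);
 */
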
#include <assert.h>

#include <stdlib.h>

#define WEPOLL_INTERNAL static
#define WEPOLL_INTERNAL_VAR static

#ifndef WIN32_LEAN_AND_MEAN
#define WIN32_LEAN_AND_MEAN
#endif

#ifdef __clang__
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wreserved-id-macro"
#endif

#ifdef _WIN32_WINNT
#undef _WIN32_WINNT
#endif

#define _WIN32_WINNT 0x0600

#ifdef __clang__
#pragma clang diagnostic pop
#endif

#ifndef __GNUC__
#pragma warning(push, 1)
#endif

#include <ws2tcpip.h>
#include <winsock2.h>
#include <windows.h>

#ifndef __GNUC__
#pragma warning(pop)
#endif

WEPOLL_INTERNAL int nt_global_init(void);

typedef LONG NTSTATUS;
typedef NTSTATUS* PNTSTATUS;

#ifndef NT_SUCCESS
#define NT_SUCCESS(status) (((NTSTATUS)(status)) >= 0)
#endif

#ifndef STATUS_SUCCESS
#define STATUS_SUCCESS ((NTSTATUS) 0x00000000L)
#endif

#ifndef STATUS_PENDING
#define STATUS_PENDING ((NTSTATUS) 0x00000103L)
#endif

#ifndef STATUS_CANCELLED
#define STATUS_CANCELLED ((NTSTATUS) 0xC0000120L)
#endif

#ifndef STATUS_NOT_FOUND
#define STATUS_NOT_FOUND ((NTSTATUS) 0xC0000225L)
#endif

typedef struct _IO_STATUS_BLOCK {
  NTSTATUS Status;
  ULONG_PTR Information;
} IO_STATUS_BLOCK, *PIO_STATUS_BLOCK;

typedef VOID(NTAPI* PIO_APC_ROUTINE)(PVOID ApcContext,
                                     PIO_STATUS_BLOCK IoStatusBlock,
                                     ULONG Reserved);

typedef struct _UNICODE_STRING {
  USHORT Length;
  USHORT MaximumLength;
  PWSTR Buffer;
} UNICODE_STRING, *PUNICODE_STRING;

#define RTL_CONSTANT_STRING(s) \
  { sizeof(s) - sizeof((s)[0]), sizeof(s), s }
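/* Note: for a string literal, `sizeof(s) - sizeof((s)[0])` is the size in
 * bytes excluding the terminating NUL, so Length excludes the terminator while
 * MaximumLength (`sizeof(s)`) includes it. */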

typedef struct _OBJECT_ATTRIBUTES {
  ULONG Length;
  HANDLE RootDirectory;
  PUNICODE_STRING ObjectName;
  ULONG Attributes;
  PVOID SecurityDescriptor;
  PVOID SecurityQualityOfService;
} OBJECT_ATTRIBUTES, *POBJECT_ATTRIBUTES;

#define RTL_CONSTANT_OBJECT_ATTRIBUTES(ObjectName, Attributes) \
  { sizeof(OBJECT_ATTRIBUTES), NULL, ObjectName, Attributes, NULL, NULL }

#ifndef FILE_OPEN
#define FILE_OPEN 0x00000001UL
#endif

#define KEYEDEVENT_WAIT 0x00000001UL
#define KEYEDEVENT_WAKE 0x00000002UL
#define KEYEDEVENT_ALL_ACCESS \
  (STANDARD_RIGHTS_REQUIRED | KEYEDEVENT_WAIT | KEYEDEVENT_WAKE)

#define NT_NTDLL_IMPORT_LIST(X)           \
  X(NTSTATUS,                             \
    NTAPI,                                \
    NtCancelIoFileEx,                     \
    (HANDLE FileHandle,                   \
     PIO_STATUS_BLOCK IoRequestToCancel,  \
     PIO_STATUS_BLOCK IoStatusBlock))     \
                                          \
  X(NTSTATUS,                             \
    NTAPI,                                \
    NtCreateFile,                         \
    (PHANDLE FileHandle,                  \
     ACCESS_MASK DesiredAccess,           \
     POBJECT_ATTRIBUTES ObjectAttributes, \
     PIO_STATUS_BLOCK IoStatusBlock,      \
     PLARGE_INTEGER AllocationSize,       \
     ULONG FileAttributes,                \
     ULONG ShareAccess,                   \
     ULONG CreateDisposition,             \
     ULONG CreateOptions,                 \
     PVOID EaBuffer,                      \
     ULONG EaLength))                     \
                                          \
  X(NTSTATUS,                             \
    NTAPI,                                \
    NtCreateKeyedEvent,                   \
    (PHANDLE KeyedEventHandle,            \
     ACCESS_MASK DesiredAccess,           \
     POBJECT_ATTRIBUTES ObjectAttributes, \
     ULONG Flags))                        \
                                          \
  X(NTSTATUS,                             \
    NTAPI,                                \
    NtDeviceIoControlFile,                \
    (HANDLE FileHandle,                   \
     HANDLE Event,                        \
     PIO_APC_ROUTINE ApcRoutine,          \
     PVOID ApcContext,                    \
     PIO_STATUS_BLOCK IoStatusBlock,      \
     ULONG IoControlCode,                 \
     PVOID InputBuffer,                   \
     ULONG InputBufferLength,             \
     PVOID OutputBuffer,                  \
     ULONG OutputBufferLength))           \
                                          \
  X(NTSTATUS,                             \
    NTAPI,                                \
    NtReleaseKeyedEvent,                  \
    (HANDLE KeyedEventHandle,             \
     PVOID KeyValue,                      \
     BOOLEAN Alertable,                   \
     PLARGE_INTEGER Timeout))             \
                                          \
  X(NTSTATUS,                             \
    NTAPI,                                \
    NtWaitForKeyedEvent,                  \
    (HANDLE KeyedEventHandle,             \
     PVOID KeyValue,                      \
     BOOLEAN Alertable,                   \
     PLARGE_INTEGER Timeout))             \
                                          \
  X(ULONG, WINAPI, RtlNtStatusToDosError, (NTSTATUS Status))

#define X(return_type, attributes, name, parameters) \
  WEPOLL_INTERNAL_VAR return_type(attributes* name) parameters;
NT_NTDLL_IMPORT_LIST(X)
#undef X
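
/* For example, the X() expansion above turns the NtCancelIoFileEx entry of
 * NT_NTDLL_IMPORT_LIST into a function-pointer declaration:
 *
 *   WEPOLL_INTERNAL_VAR NTSTATUS(NTAPI* NtCancelIoFileEx)(
 *       HANDLE FileHandle,
 *       PIO_STATUS_BLOCK IoRequestToCancel,
 *       PIO_STATUS_BLOCK IoStatusBlock);
 *
 * The same list is expanded again in nt_global_init() below to resolve each
 * pointer from ntdll.dll at runtime. */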

#define AFD_POLL_RECEIVE           0x0001
#define AFD_POLL_RECEIVE_EXPEDITED 0x0002
#define AFD_POLL_SEND              0x0004
#define AFD_POLL_DISCONNECT        0x0008
#define AFD_POLL_ABORT             0x0010
#define AFD_POLL_LOCAL_CLOSE       0x0020
#define AFD_POLL_ACCEPT            0x0080
#define AFD_POLL_CONNECT_FAIL      0x0100

typedef struct _AFD_POLL_HANDLE_INFO {
  HANDLE Handle;
  ULONG Events;
  NTSTATUS Status;
} AFD_POLL_HANDLE_INFO, *PAFD_POLL_HANDLE_INFO;

typedef struct _AFD_POLL_INFO {
  LARGE_INTEGER Timeout;
  ULONG NumberOfHandles;
  ULONG Exclusive;
  AFD_POLL_HANDLE_INFO Handles[1];
} AFD_POLL_INFO, *PAFD_POLL_INFO;

WEPOLL_INTERNAL int afd_create_helper_handle(HANDLE iocp_handle,
                                             HANDLE* afd_helper_handle_out);

WEPOLL_INTERNAL int afd_poll(HANDLE afd_helper_handle,
                             AFD_POLL_INFO* poll_info,
                             IO_STATUS_BLOCK* io_status_block);
WEPOLL_INTERNAL int afd_cancel_poll(HANDLE afd_helper_handle,
                                    IO_STATUS_BLOCK* io_status_block);

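/* Illustrative sketch only: to poll a single socket for readability, a caller
 * would fill in AFD_POLL_INFO along these lines before handing it to
 * afd_poll(). (`base_socket` is a hypothetical, already-obtained base socket;
 * the real request is assembled by sock_update(), declared further below.)
 *
 *   AFD_POLL_INFO info;
 *   info.Timeout.QuadPart = INT64_MAX;  // no driver-side timeout
 *   info.NumberOfHandles = 1;
 *   info.Exclusive = FALSE;
 *   info.Handles[0].Handle = (HANDLE) base_socket;
 *   info.Handles[0].Status = 0;
 *   info.Handles[0].Events = AFD_POLL_RECEIVE;
 */
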
#define return_map_error(value) \
  do {                          \
    err_map_win_error();        \
    return (value);             \
  } while (0)

#define return_set_error(value, error) \
  do {                                 \
    err_set_win_error(error);          \
    return (value);                    \
  } while (0)

WEPOLL_INTERNAL void err_map_win_error(void);
WEPOLL_INTERNAL void err_set_win_error(DWORD error);
WEPOLL_INTERNAL int err_check_handle(HANDLE handle);

#define IOCTL_AFD_POLL 0x00012024

static UNICODE_STRING afd__helper_name =
    RTL_CONSTANT_STRING(L"\\Device\\Afd\\Wepoll");

static OBJECT_ATTRIBUTES afd__helper_attributes =
    RTL_CONSTANT_OBJECT_ATTRIBUTES(&afd__helper_name, 0);

int afd_create_helper_handle(HANDLE iocp_handle,
                             HANDLE* afd_helper_handle_out) {
  HANDLE afd_helper_handle;
  IO_STATUS_BLOCK iosb;
  NTSTATUS status;

  /* By opening \Device\Afd without specifying any extended attributes, we'll
   * get a handle that lets us talk to the AFD driver, but that doesn't have an
   * associated endpoint (so it's not a socket). */
  status = NtCreateFile(&afd_helper_handle,
                        SYNCHRONIZE,
                        &afd__helper_attributes,
                        &iosb,
                        NULL,
                        0,
                        FILE_SHARE_READ | FILE_SHARE_WRITE,
                        FILE_OPEN,
                        0,
                        NULL,
                        0);
  if (status != STATUS_SUCCESS)
    return_set_error(-1, RtlNtStatusToDosError(status));

  if (CreateIoCompletionPort(afd_helper_handle, iocp_handle, 0, 0) == NULL)
    goto error;

  if (!SetFileCompletionNotificationModes(afd_helper_handle,
                                          FILE_SKIP_SET_EVENT_ON_HANDLE))
    goto error;

  *afd_helper_handle_out = afd_helper_handle;
  return 0;

error:
  CloseHandle(afd_helper_handle);
  return_map_error(-1);
}

int afd_poll(HANDLE afd_helper_handle,
             AFD_POLL_INFO* poll_info,
             IO_STATUS_BLOCK* io_status_block) {
  NTSTATUS status;

  /* Blocking operation is not supported. */
  assert(io_status_block != NULL);

  io_status_block->Status = STATUS_PENDING;
  status = NtDeviceIoControlFile(afd_helper_handle,
                                 NULL,
                                 NULL,
                                 io_status_block,
                                 io_status_block,
                                 IOCTL_AFD_POLL,
                                 poll_info,
                                 sizeof *poll_info,
                                 poll_info,
                                 sizeof *poll_info);

  if (status == STATUS_SUCCESS)
    return 0;
  else if (status == STATUS_PENDING)
    return_set_error(-1, ERROR_IO_PENDING);
  else
    return_set_error(-1, RtlNtStatusToDosError(status));
}

int afd_cancel_poll(HANDLE afd_helper_handle,
                    IO_STATUS_BLOCK* io_status_block) {
  NTSTATUS cancel_status;
  IO_STATUS_BLOCK cancel_iosb;

  /* If the poll operation has already completed or has been cancelled earlier,
   * there's nothing left for us to do. */
  if (io_status_block->Status != STATUS_PENDING)
    return 0;

  cancel_status =
      NtCancelIoFileEx(afd_helper_handle, io_status_block, &cancel_iosb);

  /* NtCancelIoFileEx() may return STATUS_NOT_FOUND if the operation completed
   * just before calling NtCancelIoFileEx(). This is not an error. */
  if (cancel_status == STATUS_SUCCESS || cancel_status == STATUS_NOT_FOUND)
    return 0;
  else
    return_set_error(-1, RtlNtStatusToDosError(cancel_status));
}

WEPOLL_INTERNAL int epoll_global_init(void);

WEPOLL_INTERNAL int init(void);

typedef struct port_state port_state_t;
typedef struct queue queue_t;
typedef struct sock_state sock_state_t;
typedef struct ts_tree_node ts_tree_node_t;

WEPOLL_INTERNAL port_state_t* port_new(HANDLE* iocp_handle_out);
WEPOLL_INTERNAL int port_close(port_state_t* port_state);
WEPOLL_INTERNAL int port_delete(port_state_t* port_state);

WEPOLL_INTERNAL int port_wait(port_state_t* port_state,
                              struct epoll_event* events,
                              int maxevents,
                              int timeout);

WEPOLL_INTERNAL int port_ctl(port_state_t* port_state,
                             int op,
                             SOCKET sock,
                             struct epoll_event* ev);

WEPOLL_INTERNAL int port_register_socket_handle(port_state_t* port_state,
                                                sock_state_t* sock_state,
                                                SOCKET socket);
WEPOLL_INTERNAL void port_unregister_socket_handle(port_state_t* port_state,
                                                   sock_state_t* sock_state);
WEPOLL_INTERNAL sock_state_t* port_find_socket(port_state_t* port_state,
                                               SOCKET socket);

WEPOLL_INTERNAL void port_request_socket_update(port_state_t* port_state,
                                                sock_state_t* sock_state);
WEPOLL_INTERNAL void port_cancel_socket_update(port_state_t* port_state,
                                               sock_state_t* sock_state);

WEPOLL_INTERNAL void port_add_deleted_socket(port_state_t* port_state,
                                             sock_state_t* sock_state);
WEPOLL_INTERNAL void port_remove_deleted_socket(port_state_t* port_state,
                                                sock_state_t* sock_state);

WEPOLL_INTERNAL HANDLE port_get_iocp_handle(port_state_t* port_state);
WEPOLL_INTERNAL queue_t* port_get_poll_group_queue(port_state_t* port_state);

WEPOLL_INTERNAL port_state_t* port_state_from_handle_tree_node(
    ts_tree_node_t* tree_node);
WEPOLL_INTERNAL ts_tree_node_t* port_state_to_handle_tree_node(
    port_state_t* port_state);

/* The reflock is a special kind of lock that normally prevents a chunk of
 * memory from being freed, but does allow the chunk of memory to eventually be
 * released in a coordinated fashion.
 *
 * Under normal operation, threads increase and decrease the reference count,
 * which are wait-free operations.
 *
 * Exactly once during the reflock's lifecycle, a thread holding a reference to
 * the lock may "destroy" the lock; this operation blocks until all other
 * threads holding a reference to the lock have dereferenced it. After
 * "destroy" returns, the calling thread may assume that no other threads have
 * a reference to the lock.
 *
 * Attempting to lock or destroy a lock after reflock_unref_and_destroy() has
 * been called is invalid and results in undefined behavior. Therefore the user
 * should use another lock to guarantee that this can't happen.
 */

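/* A minimal lifecycle sketch (illustrative only), for some shared object
 * `obj` that embeds a reflock:
 *
 *   reflock_init(&obj->reflock);
 *
 *   // Any thread, before touching obj:
 *   reflock_ref(&obj->reflock);
 *   use(obj);
 *   reflock_unref(&obj->reflock);
 *
 *   // Exactly one thread, exactly once; blocks until all other
 *   // references have been dropped:
 *   reflock_unref_and_destroy(&obj->reflock);
 *   free(obj);
 */
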
typedef struct reflock {
  volatile long state; /* 32-bit Interlocked APIs operate on `long` values. */
} reflock_t;

WEPOLL_INTERNAL int reflock_global_init(void);

WEPOLL_INTERNAL void reflock_init(reflock_t* reflock);
WEPOLL_INTERNAL void reflock_ref(reflock_t* reflock);
WEPOLL_INTERNAL void reflock_unref(reflock_t* reflock);
WEPOLL_INTERNAL void reflock_unref_and_destroy(reflock_t* reflock);

#include <stdbool.h>

/* N.b.: the tree functions do not set errno or LastError when they fail. Each
 * of the API functions has at most one failure mode. It is up to the caller to
 * set an appropriate error code when necessary. */

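/* For example, tree_add() below fails only when the key already exists; its
 * caller port_register_socket_handle() is the one that maps that failure to
 * ERROR_ALREADY_EXISTS. */
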
typedef struct tree tree_t;
typedef struct tree_node tree_node_t;

typedef struct tree {
  tree_node_t* root;
} tree_t;

typedef struct tree_node {
  tree_node_t* left;
  tree_node_t* right;
  tree_node_t* parent;
  uintptr_t key;
  bool red;
} tree_node_t;

WEPOLL_INTERNAL void tree_init(tree_t* tree);
WEPOLL_INTERNAL void tree_node_init(tree_node_t* node);

WEPOLL_INTERNAL int tree_add(tree_t* tree, tree_node_t* node, uintptr_t key);
WEPOLL_INTERNAL void tree_del(tree_t* tree, tree_node_t* node);

WEPOLL_INTERNAL tree_node_t* tree_find(const tree_t* tree, uintptr_t key);
WEPOLL_INTERNAL tree_node_t* tree_root(const tree_t* tree);

typedef struct ts_tree {
  tree_t tree;
  SRWLOCK lock;
} ts_tree_t;

typedef struct ts_tree_node {
  tree_node_t tree_node;
  reflock_t reflock;
} ts_tree_node_t;

WEPOLL_INTERNAL void ts_tree_init(ts_tree_t* rtl);
WEPOLL_INTERNAL void ts_tree_node_init(ts_tree_node_t* node);

WEPOLL_INTERNAL int ts_tree_add(ts_tree_t* ts_tree,
                                ts_tree_node_t* node,
                                uintptr_t key);

WEPOLL_INTERNAL ts_tree_node_t* ts_tree_del_and_ref(ts_tree_t* ts_tree,
                                                    uintptr_t key);
WEPOLL_INTERNAL ts_tree_node_t* ts_tree_find_and_ref(ts_tree_t* ts_tree,
                                                     uintptr_t key);

WEPOLL_INTERNAL void ts_tree_node_unref(ts_tree_node_t* node);
WEPOLL_INTERNAL void ts_tree_node_unref_and_destroy(ts_tree_node_t* node);

static ts_tree_t epoll__handle_tree;

int epoll_global_init(void) {
  ts_tree_init(&epoll__handle_tree);
  return 0;
}

static HANDLE epoll__create(void) {
  port_state_t* port_state;
  HANDLE ephnd;
  ts_tree_node_t* tree_node;

  if (init() < 0)
    return NULL;

  port_state = port_new(&ephnd);
  if (port_state == NULL)
    return NULL;

  tree_node = port_state_to_handle_tree_node(port_state);
  if (ts_tree_add(&epoll__handle_tree, tree_node, (uintptr_t) ephnd) < 0) {
    /* This should never happen. */
    port_delete(port_state);
    return_set_error(NULL, ERROR_ALREADY_EXISTS);
  }

  return ephnd;
}

HANDLE epoll_create(int size) {
  if (size <= 0)
    return_set_error(NULL, ERROR_INVALID_PARAMETER);

  return epoll__create();
}

HANDLE epoll_create1(int flags) {
  if (flags != 0)
    return_set_error(NULL, ERROR_INVALID_PARAMETER);

  return epoll__create();
}

int epoll_close(HANDLE ephnd) {
  ts_tree_node_t* tree_node;
  port_state_t* port_state;

  if (init() < 0)
    return -1;

  tree_node = ts_tree_del_and_ref(&epoll__handle_tree, (uintptr_t) ephnd);
  if (tree_node == NULL) {
    err_set_win_error(ERROR_INVALID_PARAMETER);
    goto err;
  }

  port_state = port_state_from_handle_tree_node(tree_node);
  port_close(port_state);

  ts_tree_node_unref_and_destroy(tree_node);

  return port_delete(port_state);

err:
  err_check_handle(ephnd);
  return -1;
}

int epoll_ctl(HANDLE ephnd, int op, SOCKET sock, struct epoll_event* ev) {
  ts_tree_node_t* tree_node;
  port_state_t* port_state;
  int r;

  if (init() < 0)
    return -1;

  tree_node = ts_tree_find_and_ref(&epoll__handle_tree, (uintptr_t) ephnd);
  if (tree_node == NULL) {
    err_set_win_error(ERROR_INVALID_PARAMETER);
    goto err;
  }

  port_state = port_state_from_handle_tree_node(tree_node);
  r = port_ctl(port_state, op, sock, ev);

  ts_tree_node_unref(tree_node);

  if (r < 0)
    goto err;

  return 0;

err:
  /* On Linux, in the case of epoll_ctl(), EBADF takes priority over other
   * errors. Wepoll mimics this behavior. */
  err_check_handle(ephnd);
  err_check_handle((HANDLE) sock);
  return -1;
}

int epoll_wait(HANDLE ephnd,
               struct epoll_event* events,
               int maxevents,
               int timeout) {
  ts_tree_node_t* tree_node;
  port_state_t* port_state;
  int num_events;

  if (maxevents <= 0)
    return_set_error(-1, ERROR_INVALID_PARAMETER);

  if (init() < 0)
    return -1;

  tree_node = ts_tree_find_and_ref(&epoll__handle_tree, (uintptr_t) ephnd);
  if (tree_node == NULL) {
    err_set_win_error(ERROR_INVALID_PARAMETER);
    goto err;
  }

  port_state = port_state_from_handle_tree_node(tree_node);
  num_events = port_wait(port_state, events, maxevents, timeout);

  ts_tree_node_unref(tree_node);

  if (num_events < 0)
    goto err;

  return num_events;

err:
  err_check_handle(ephnd);
  return -1;
}

#include <errno.h>

#define ERR__ERRNO_MAPPINGS(X)               \
  X(ERROR_ACCESS_DENIED, EACCES)             \
  X(ERROR_ALREADY_EXISTS, EEXIST)            \
  X(ERROR_BAD_COMMAND, EACCES)               \
  X(ERROR_BAD_EXE_FORMAT, ENOEXEC)           \
  X(ERROR_BAD_LENGTH, EACCES)                \
  X(ERROR_BAD_NETPATH, ENOENT)               \
  X(ERROR_BAD_NET_NAME, ENOENT)              \
  X(ERROR_BAD_NET_RESP, ENETDOWN)            \
  X(ERROR_BAD_PATHNAME, ENOENT)              \
  X(ERROR_BROKEN_PIPE, EPIPE)                \
  X(ERROR_CANNOT_MAKE, EACCES)               \
  X(ERROR_COMMITMENT_LIMIT, ENOMEM)          \
  X(ERROR_CONNECTION_ABORTED, ECONNABORTED)  \
  X(ERROR_CONNECTION_ACTIVE, EISCONN)        \
  X(ERROR_CONNECTION_REFUSED, ECONNREFUSED)  \
  X(ERROR_CRC, EACCES)                       \
  X(ERROR_DIR_NOT_EMPTY, ENOTEMPTY)          \
  X(ERROR_DISK_FULL, ENOSPC)                 \
  X(ERROR_DUP_NAME, EADDRINUSE)              \
  X(ERROR_FILENAME_EXCED_RANGE, ENOENT)      \
  X(ERROR_FILE_NOT_FOUND, ENOENT)            \
  X(ERROR_GEN_FAILURE, EACCES)               \
  X(ERROR_GRACEFUL_DISCONNECT, EPIPE)        \
  X(ERROR_HOST_DOWN, EHOSTUNREACH)           \
  X(ERROR_HOST_UNREACHABLE, EHOSTUNREACH)    \
  X(ERROR_INSUFFICIENT_BUFFER, EFAULT)       \
  X(ERROR_INVALID_ADDRESS, EADDRNOTAVAIL)    \
  X(ERROR_INVALID_FUNCTION, EINVAL)          \
  X(ERROR_INVALID_HANDLE, EBADF)             \
  X(ERROR_INVALID_NETNAME, EADDRNOTAVAIL)    \
  X(ERROR_INVALID_PARAMETER, EINVAL)         \
  X(ERROR_INVALID_USER_BUFFER, EMSGSIZE)     \
  X(ERROR_IO_PENDING, EINPROGRESS)           \
  X(ERROR_LOCK_VIOLATION, EACCES)            \
  X(ERROR_MORE_DATA, EMSGSIZE)               \
  X(ERROR_NETNAME_DELETED, ECONNABORTED)     \
  X(ERROR_NETWORK_ACCESS_DENIED, EACCES)     \
  X(ERROR_NETWORK_BUSY, ENETDOWN)            \
  X(ERROR_NETWORK_UNREACHABLE, ENETUNREACH)  \
  X(ERROR_NOACCESS, EFAULT)                  \
  X(ERROR_NONPAGED_SYSTEM_RESOURCES, ENOMEM) \
  X(ERROR_NOT_ENOUGH_MEMORY, ENOMEM)         \
  X(ERROR_NOT_ENOUGH_QUOTA, ENOMEM)          \
  X(ERROR_NOT_FOUND, ENOENT)                 \
  X(ERROR_NOT_LOCKED, EACCES)                \
  X(ERROR_NOT_READY, EACCES)                 \
  X(ERROR_NOT_SAME_DEVICE, EXDEV)            \
  X(ERROR_NOT_SUPPORTED, ENOTSUP)            \
  X(ERROR_NO_MORE_FILES, ENOENT)             \
  X(ERROR_NO_SYSTEM_RESOURCES, ENOMEM)       \
  X(ERROR_OPERATION_ABORTED, EINTR)          \
  X(ERROR_OUT_OF_PAPER, EACCES)              \
  X(ERROR_PAGED_SYSTEM_RESOURCES, ENOMEM)    \
  X(ERROR_PAGEFILE_QUOTA, ENOMEM)            \
  X(ERROR_PATH_NOT_FOUND, ENOENT)            \
  X(ERROR_PIPE_NOT_CONNECTED, EPIPE)         \
  X(ERROR_PORT_UNREACHABLE, ECONNRESET)      \
  X(ERROR_PROTOCOL_UNREACHABLE, ENETUNREACH) \
  X(ERROR_REM_NOT_LIST, ECONNREFUSED)        \
  X(ERROR_REQUEST_ABORTED, EINTR)            \
  X(ERROR_REQ_NOT_ACCEP, EWOULDBLOCK)        \
  X(ERROR_SECTOR_NOT_FOUND, EACCES)          \
  X(ERROR_SEM_TIMEOUT, ETIMEDOUT)            \
  X(ERROR_SHARING_VIOLATION, EACCES)         \
  X(ERROR_TOO_MANY_NAMES, ENOMEM)            \
  X(ERROR_TOO_MANY_OPEN_FILES, EMFILE)       \
  X(ERROR_UNEXP_NET_ERR, ECONNABORTED)       \
  X(ERROR_WAIT_NO_CHILDREN, ECHILD)          \
  X(ERROR_WORKING_SET_QUOTA, ENOMEM)         \
  X(ERROR_WRITE_PROTECT, EACCES)             \
  X(ERROR_WRONG_DISK, EACCES)                \
  X(WSAEACCES, EACCES)                       \
  X(WSAEADDRINUSE, EADDRINUSE)               \
  X(WSAEADDRNOTAVAIL, EADDRNOTAVAIL)         \
  X(WSAEAFNOSUPPORT, EAFNOSUPPORT)           \
  X(WSAECONNABORTED, ECONNABORTED)           \
  X(WSAECONNREFUSED, ECONNREFUSED)           \
  X(WSAECONNRESET, ECONNRESET)               \
  X(WSAEDISCON, EPIPE)                       \
  X(WSAEFAULT, EFAULT)                       \
  X(WSAEHOSTDOWN, EHOSTUNREACH)              \
  X(WSAEHOSTUNREACH, EHOSTUNREACH)           \
  X(WSAEINPROGRESS, EBUSY)                   \
  X(WSAEINTR, EINTR)                         \
  X(WSAEINVAL, EINVAL)                       \
  X(WSAEISCONN, EISCONN)                     \
  X(WSAEMSGSIZE, EMSGSIZE)                   \
  X(WSAENETDOWN, ENETDOWN)                   \
  X(WSAENETRESET, EHOSTUNREACH)              \
  X(WSAENETUNREACH, ENETUNREACH)             \
  X(WSAENOBUFS, ENOMEM)                      \
  X(WSAENOTCONN, ENOTCONN)                   \
  X(WSAENOTSOCK, ENOTSOCK)                   \
  X(WSAEOPNOTSUPP, EOPNOTSUPP)               \
  X(WSAEPROCLIM, ENOMEM)                     \
  X(WSAESHUTDOWN, EPIPE)                     \
  X(WSAETIMEDOUT, ETIMEDOUT)                 \
  X(WSAEWOULDBLOCK, EWOULDBLOCK)             \
  X(WSANOTINITIALISED, ENETDOWN)             \
  X(WSASYSNOTREADY, ENETDOWN)                \
  X(WSAVERNOTSUPPORTED, ENOSYS)

static errno_t err__map_win_error_to_errno(DWORD error) {
  switch (error) {
#define X(error_sym, errno_sym) \
  case error_sym:               \
    return errno_sym;
    ERR__ERRNO_MAPPINGS(X)
#undef X
  }
  return EINVAL;
}

void err_map_win_error(void) {
  errno = err__map_win_error_to_errno(GetLastError());
}

void err_set_win_error(DWORD error) {
  SetLastError(error);
  errno = err__map_win_error_to_errno(error);
}

int err_check_handle(HANDLE handle) {
  DWORD flags;

  /* GetHandleInformation() succeeds when passed INVALID_HANDLE_VALUE, so check
   * for this condition explicitly. */
  if (handle == INVALID_HANDLE_VALUE)
    return_set_error(-1, ERROR_INVALID_HANDLE);

  if (!GetHandleInformation(handle, &flags))
    return_map_error(-1);

  return 0;
}

#include <stddef.h>

#define array_count(a) (sizeof(a) / (sizeof((a)[0])))

#define container_of(ptr, type, member) \
  ((type*) ((uintptr_t) (ptr) - offsetof(type, member)))

#define unused_var(v) ((void) (v))
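
/* container_of() recovers a pointer to the enclosing struct from a pointer to
 * one of its members, e.g. poll_group_from_queue_node() below maps the
 * queue_node_t embedded in a poll_group_t back to the poll_group_t itself. */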

/* Polyfill `inline` for older versions of msvc (up to Visual Studio 2013) */
#if defined(_MSC_VER) && _MSC_VER < 1900
#define inline __inline
#endif

WEPOLL_INTERNAL int ws_global_init(void);
WEPOLL_INTERNAL SOCKET ws_get_base_socket(SOCKET socket);

static bool init__done = false;
static INIT_ONCE init__once = INIT_ONCE_STATIC_INIT;

static BOOL CALLBACK init__once_callback(INIT_ONCE* once,
                                         void* parameter,
                                         void** context) {
  unused_var(once);
  unused_var(parameter);
  unused_var(context);

  /* N.b. that initialization order matters here. */
  if (ws_global_init() < 0 || nt_global_init() < 0 ||
      reflock_global_init() < 0 || epoll_global_init() < 0)
    return FALSE;

  init__done = true;
  return TRUE;
}

int init(void) {
  if (!init__done &&
      !InitOnceExecuteOnce(&init__once, init__once_callback, NULL, NULL))
    /* `InitOnceExecuteOnce()` itself is infallible, and it doesn't set any
     * error code when the once-callback returns FALSE. We return -1 here to
     * indicate that global initialization failed; the failing init function is
     * responsible for setting `errno` and calling `SetLastError()`. */
    return -1;

  return 0;
}

/* Set up a workaround for the following problem:
 *   FARPROC addr = GetProcAddress(...);
 *   MY_FUNC func = (MY_FUNC) addr;          <-- GCC 8 warning/error.
 *   MY_FUNC func = (MY_FUNC) (void*) addr;  <-- MSVC  warning/error.
 * To compile cleanly with either compiler, do casts with this "bridge" type:
 *   MY_FUNC func = (MY_FUNC) (nt__fn_ptr_cast_t) addr; */
#ifdef __GNUC__
typedef void* nt__fn_ptr_cast_t;
#else
typedef FARPROC nt__fn_ptr_cast_t;
#endif

#define X(return_type, attributes, name, parameters) \
  WEPOLL_INTERNAL return_type(attributes* name) parameters = NULL;
NT_NTDLL_IMPORT_LIST(X)
#undef X

int nt_global_init(void) {
  HMODULE ntdll;
  FARPROC fn_ptr;

  ntdll = GetModuleHandleW(L"ntdll.dll");
  if (ntdll == NULL)
    return -1;

#define X(return_type, attributes, name, parameters) \
  fn_ptr = GetProcAddress(ntdll, #name);             \
  if (fn_ptr == NULL)                                \
    return -1;                                       \
  name = (return_type(attributes*) parameters)(nt__fn_ptr_cast_t) fn_ptr;
  NT_NTDLL_IMPORT_LIST(X)
#undef X

  return 0;
}

#include <string.h>

typedef struct poll_group poll_group_t;

typedef struct queue_node queue_node_t;

WEPOLL_INTERNAL poll_group_t* poll_group_acquire(port_state_t* port);
WEPOLL_INTERNAL void poll_group_release(poll_group_t* poll_group);

WEPOLL_INTERNAL void poll_group_delete(poll_group_t* poll_group);

WEPOLL_INTERNAL poll_group_t* poll_group_from_queue_node(
    queue_node_t* queue_node);
WEPOLL_INTERNAL HANDLE
    poll_group_get_afd_helper_handle(poll_group_t* poll_group);

typedef struct queue_node {
  queue_node_t* prev;
  queue_node_t* next;
} queue_node_t;

typedef struct queue {
  queue_node_t head;
} queue_t;

WEPOLL_INTERNAL void queue_init(queue_t* queue);
WEPOLL_INTERNAL void queue_node_init(queue_node_t* node);

WEPOLL_INTERNAL queue_node_t* queue_first(const queue_t* queue);
WEPOLL_INTERNAL queue_node_t* queue_last(const queue_t* queue);

WEPOLL_INTERNAL void queue_prepend(queue_t* queue, queue_node_t* node);
WEPOLL_INTERNAL void queue_append(queue_t* queue, queue_node_t* node);
WEPOLL_INTERNAL void queue_move_first(queue_t* queue, queue_node_t* node);
WEPOLL_INTERNAL void queue_move_last(queue_t* queue, queue_node_t* node);
WEPOLL_INTERNAL void queue_remove(queue_node_t* node);

WEPOLL_INTERNAL bool queue_empty(const queue_t* queue);
WEPOLL_INTERNAL bool queue_enqueued(const queue_node_t* node);

static const size_t POLL_GROUP__MAX_GROUP_SIZE = 32;

typedef struct poll_group {
  port_state_t* port_state;
  queue_node_t queue_node;
  HANDLE afd_helper_handle;
  size_t group_size;
} poll_group_t;

static poll_group_t* poll_group__new(port_state_t* port_state) {
  HANDLE iocp_handle = port_get_iocp_handle(port_state);
  queue_t* poll_group_queue = port_get_poll_group_queue(port_state);

  poll_group_t* poll_group = malloc(sizeof *poll_group);
  if (poll_group == NULL)
    return_set_error(NULL, ERROR_NOT_ENOUGH_MEMORY);

  memset(poll_group, 0, sizeof *poll_group);

  queue_node_init(&poll_group->queue_node);
  poll_group->port_state = port_state;

  if (afd_create_helper_handle(iocp_handle, &poll_group->afd_helper_handle) <
      0) {
    free(poll_group);
    return NULL;
  }

  queue_append(poll_group_queue, &poll_group->queue_node);

  return poll_group;
}

void poll_group_delete(poll_group_t* poll_group) {
  assert(poll_group->group_size == 0);
  CloseHandle(poll_group->afd_helper_handle);
  queue_remove(&poll_group->queue_node);
  free(poll_group);
}

poll_group_t* poll_group_from_queue_node(queue_node_t* queue_node) {
  return container_of(queue_node, poll_group_t, queue_node);
}

HANDLE poll_group_get_afd_helper_handle(poll_group_t* poll_group) {
  return poll_group->afd_helper_handle;
}

poll_group_t* poll_group_acquire(port_state_t* port_state) {
  queue_t* poll_group_queue = port_get_poll_group_queue(port_state);
  poll_group_t* poll_group =
      !queue_empty(poll_group_queue)
          ? container_of(
                queue_last(poll_group_queue), poll_group_t, queue_node)
          : NULL;

  if (poll_group == NULL ||
      poll_group->group_size >= POLL_GROUP__MAX_GROUP_SIZE)
    poll_group = poll_group__new(port_state);
  if (poll_group == NULL)
    return NULL;

  if (++poll_group->group_size == POLL_GROUP__MAX_GROUP_SIZE)
    queue_move_first(poll_group_queue, &poll_group->queue_node);

  return poll_group;
}

void poll_group_release(poll_group_t* poll_group) {
  port_state_t* port_state = poll_group->port_state;
  queue_t* poll_group_queue = port_get_poll_group_queue(port_state);

  poll_group->group_size--;
  assert(poll_group->group_size < POLL_GROUP__MAX_GROUP_SIZE);

  queue_move_last(poll_group_queue, &poll_group->queue_node);

  /* Poll groups are currently only freed when the epoll port is closed. */
}

WEPOLL_INTERNAL sock_state_t* sock_new(port_state_t* port_state,
                                       SOCKET socket);
WEPOLL_INTERNAL void sock_delete(port_state_t* port_state,
                                 sock_state_t* sock_state);
WEPOLL_INTERNAL void sock_force_delete(port_state_t* port_state,
                                       sock_state_t* sock_state);

WEPOLL_INTERNAL int sock_set_event(port_state_t* port_state,
                                   sock_state_t* sock_state,
                                   const struct epoll_event* ev);

WEPOLL_INTERNAL int sock_update(port_state_t* port_state,
                                sock_state_t* sock_state);
WEPOLL_INTERNAL int sock_feed_event(port_state_t* port_state,
                                    IO_STATUS_BLOCK* io_status_block,
                                    struct epoll_event* ev);

WEPOLL_INTERNAL sock_state_t* sock_state_from_queue_node(
    queue_node_t* queue_node);
WEPOLL_INTERNAL queue_node_t* sock_state_to_queue_node(
    sock_state_t* sock_state);
WEPOLL_INTERNAL sock_state_t* sock_state_from_tree_node(
    tree_node_t* tree_node);
WEPOLL_INTERNAL tree_node_t* sock_state_to_tree_node(sock_state_t* sock_state);

#define PORT__MAX_ON_STACK_COMPLETIONS 256

typedef struct port_state {
  HANDLE iocp_handle;
  tree_t sock_tree;
  queue_t sock_update_queue;
  queue_t sock_deleted_queue;
  queue_t poll_group_queue;
  ts_tree_node_t handle_tree_node;
  CRITICAL_SECTION lock;
  size_t active_poll_count;
} port_state_t;

static port_state_t* port__alloc(void) {
  port_state_t* port_state = malloc(sizeof *port_state);
  if (port_state == NULL)
    return_set_error(NULL, ERROR_NOT_ENOUGH_MEMORY);

  return port_state;
}

static void port__free(port_state_t* port) {
  assert(port != NULL);
  free(port);
}

static HANDLE port__create_iocp(void) {
  HANDLE iocp_handle =
      CreateIoCompletionPort(INVALID_HANDLE_VALUE, NULL, 0, 0);
  if (iocp_handle == NULL)
    return_map_error(NULL);

  return iocp_handle;
}

port_state_t* port_new(HANDLE* iocp_handle_out) {
  port_state_t* port_state;
  HANDLE iocp_handle;

  port_state = port__alloc();
  if (port_state == NULL)
    goto err1;

  iocp_handle = port__create_iocp();
  if (iocp_handle == NULL)
    goto err2;

  memset(port_state, 0, sizeof *port_state);

  port_state->iocp_handle = iocp_handle;
  tree_init(&port_state->sock_tree);
  queue_init(&port_state->sock_update_queue);
  queue_init(&port_state->sock_deleted_queue);
  queue_init(&port_state->poll_group_queue);
  ts_tree_node_init(&port_state->handle_tree_node);
  InitializeCriticalSection(&port_state->lock);

  *iocp_handle_out = iocp_handle;
  return port_state;

err2:
  port__free(port_state);
err1:
  return NULL;
}

static int port__close_iocp(port_state_t* port_state) {
  HANDLE iocp_handle = port_state->iocp_handle;
  port_state->iocp_handle = NULL;

  if (!CloseHandle(iocp_handle))
    return_map_error(-1);

  return 0;
}

int port_close(port_state_t* port_state) {
  int result;

  EnterCriticalSection(&port_state->lock);
  result = port__close_iocp(port_state);
  LeaveCriticalSection(&port_state->lock);

  return result;
}

int port_delete(port_state_t* port_state) {
  tree_node_t* tree_node;
  queue_node_t* queue_node;

  /* At this point the IOCP port should have been closed. */
  assert(port_state->iocp_handle == NULL);

  while ((tree_node = tree_root(&port_state->sock_tree)) != NULL) {
    sock_state_t* sock_state = sock_state_from_tree_node(tree_node);
    sock_force_delete(port_state, sock_state);
  }

  while ((queue_node = queue_first(&port_state->sock_deleted_queue)) != NULL) {
    sock_state_t* sock_state = sock_state_from_queue_node(queue_node);
    sock_force_delete(port_state, sock_state);
  }

  while ((queue_node = queue_first(&port_state->poll_group_queue)) != NULL) {
    poll_group_t* poll_group = poll_group_from_queue_node(queue_node);
    poll_group_delete(poll_group);
  }

  assert(queue_empty(&port_state->sock_update_queue));

  DeleteCriticalSection(&port_state->lock);

  port__free(port_state);

  return 0;
}

static int port__update_events(port_state_t* port_state) {
  queue_t* sock_update_queue = &port_state->sock_update_queue;

  /* Walk the queue, submitting new poll requests for every socket that needs
   * it. */
  while (!queue_empty(sock_update_queue)) {
    queue_node_t* queue_node = queue_first(sock_update_queue);
    sock_state_t* sock_state = sock_state_from_queue_node(queue_node);

    if (sock_update(port_state, sock_state) < 0)
      return -1;

    /* sock_update() removes the socket from the update queue. */
  }

  return 0;
}

static void port__update_events_if_polling(port_state_t* port_state) {
  if (port_state->active_poll_count > 0)
    port__update_events(port_state);
}

static int port__feed_events(port_state_t* port_state,
                             struct epoll_event* epoll_events,
                             OVERLAPPED_ENTRY* iocp_events,
                             DWORD iocp_event_count) {
  int epoll_event_count = 0;
  DWORD i;

  for (i = 0; i < iocp_event_count; i++) {
    IO_STATUS_BLOCK* io_status_block =
        (IO_STATUS_BLOCK*) iocp_events[i].lpOverlapped;
    struct epoll_event* ev = &epoll_events[epoll_event_count];

    epoll_event_count += sock_feed_event(port_state, io_status_block, ev);
  }

  return epoll_event_count;
}

static int port__poll(port_state_t* port_state,
                      struct epoll_event* epoll_events,
                      OVERLAPPED_ENTRY* iocp_events,
                      DWORD maxevents,
                      DWORD timeout) {
  DWORD completion_count;

  if (port__update_events(port_state) < 0)
    return -1;

  port_state->active_poll_count++;

  LeaveCriticalSection(&port_state->lock);

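  /* The port lock is deliberately not held while this thread is blocked in
   * GetQueuedCompletionStatusEx(), so other threads can call epoll_ctl() and
   * epoll_wait() on the same port concurrently. */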
  BOOL r = GetQueuedCompletionStatusEx(port_state->iocp_handle,
                                       iocp_events,
                                       maxevents,
                                       &completion_count,
                                       timeout,
                                       FALSE);

  EnterCriticalSection(&port_state->lock);

  port_state->active_poll_count--;

  if (!r)
    return_map_error(-1);

  return port__feed_events(
      port_state, epoll_events, iocp_events, completion_count);
}

int port_wait(port_state_t* port_state,
              struct epoll_event* events,
              int maxevents,
              int timeout) {
  OVERLAPPED_ENTRY stack_iocp_events[PORT__MAX_ON_STACK_COMPLETIONS];
  OVERLAPPED_ENTRY* iocp_events;
  uint64_t due = 0;
  DWORD gqcs_timeout;
  int result;

  /* Check whether `maxevents` is in range. */
  if (maxevents <= 0)
    return_set_error(-1, ERROR_INVALID_PARAMETER);

  /* Decide whether the IOCP completion list can live on the stack, or allocate
   * memory for it on the heap. */
  if ((size_t) maxevents <= array_count(stack_iocp_events)) {
    iocp_events = stack_iocp_events;
  } else if ((iocp_events =
                  malloc((size_t) maxevents * sizeof *iocp_events)) == NULL) {
    iocp_events = stack_iocp_events;
    maxevents = array_count(stack_iocp_events);
  }

  /* Compute the timeout for GetQueuedCompletionStatus, and the wait end
   * time, if the user specified a timeout other than zero or infinite. */
  if (timeout > 0) {
    due = GetTickCount64() + (uint64_t) timeout;
    gqcs_timeout = (DWORD) timeout;
  } else if (timeout == 0) {
    gqcs_timeout = 0;
  } else {
    gqcs_timeout = INFINITE;
  }

  EnterCriticalSection(&port_state->lock);

  /* Dequeue completion packets until either at least one interesting event
   * has been discovered, or the timeout is reached. */
  for (;;) {
    uint64_t now;

    result = port__poll(
        port_state, events, iocp_events, (DWORD) maxevents, gqcs_timeout);
    if (result < 0 || result > 0)
      break; /* Result, error, or time-out. */

    if (timeout < 0)
      continue; /* When timeout is negative, never time out. */

    /* Update time. */
    now = GetTickCount64();

    /* Do not allow the due time to be in the past. */
    if (now >= due) {
      SetLastError(WAIT_TIMEOUT);
      break;
    }

    /* Recompute time-out argument for GetQueuedCompletionStatus. */
    gqcs_timeout = (DWORD)(due - now);
  }

  port__update_events_if_polling(port_state);

  LeaveCriticalSection(&port_state->lock);

  if (iocp_events != stack_iocp_events)
    free(iocp_events);

  if (result >= 0)
    return result;
  else if (GetLastError() == WAIT_TIMEOUT)
    return 0;
  else
    return -1;
}

static int port__ctl_add(port_state_t* port_state,
                         SOCKET sock,
                         struct epoll_event* ev) {
  sock_state_t* sock_state = sock_new(port_state, sock);
  if (sock_state == NULL)
    return -1;

  if (sock_set_event(port_state, sock_state, ev) < 0) {
    sock_delete(port_state, sock_state);
    return -1;
  }

  port__update_events_if_polling(port_state);

  return 0;
}

static int port__ctl_mod(port_state_t* port_state,
                         SOCKET sock,
                         struct epoll_event* ev) {
  sock_state_t* sock_state = port_find_socket(port_state, sock);
  if (sock_state == NULL)
    return -1;

  if (sock_set_event(port_state, sock_state, ev) < 0)
    return -1;

  port__update_events_if_polling(port_state);

  return 0;
}

static int port__ctl_del(port_state_t* port_state, SOCKET sock) {
  sock_state_t* sock_state = port_find_socket(port_state, sock);
  if (sock_state == NULL)
    return -1;

  sock_delete(port_state, sock_state);

  return 0;
}

static int port__ctl_op(port_state_t* port_state,
                        int op,
                        SOCKET sock,
                        struct epoll_event* ev) {
  switch (op) {
    case EPOLL_CTL_ADD:
      return port__ctl_add(port_state, sock, ev);
    case EPOLL_CTL_MOD:
      return port__ctl_mod(port_state, sock, ev);
    case EPOLL_CTL_DEL:
      return port__ctl_del(port_state, sock);
    default:
      return_set_error(-1, ERROR_INVALID_PARAMETER);
  }
}

int port_ctl(port_state_t* port_state,
             int op,
             SOCKET sock,
             struct epoll_event* ev) {
  int result;

  EnterCriticalSection(&port_state->lock);
  result = port__ctl_op(port_state, op, sock, ev);
  LeaveCriticalSection(&port_state->lock);

  return result;
}

int port_register_socket_handle(port_state_t* port_state,
                                sock_state_t* sock_state,
                                SOCKET socket) {
  if (tree_add(&port_state->sock_tree,
               sock_state_to_tree_node(sock_state),
               socket) < 0)
    return_set_error(-1, ERROR_ALREADY_EXISTS);
  return 0;
}

void port_unregister_socket_handle(port_state_t* port_state,
                                   sock_state_t* sock_state) {
  tree_del(&port_state->sock_tree, sock_state_to_tree_node(sock_state));
}

sock_state_t* port_find_socket(port_state_t* port_state, SOCKET socket) {
  tree_node_t* tree_node = tree_find(&port_state->sock_tree, socket);
  if (tree_node == NULL)
    return_set_error(NULL, ERROR_NOT_FOUND);
  return sock_state_from_tree_node(tree_node);
}

void port_request_socket_update(port_state_t* port_state,
                                sock_state_t* sock_state) {
  if (queue_enqueued(sock_state_to_queue_node(sock_state)))
    return;
  queue_append(&port_state->sock_update_queue,
               sock_state_to_queue_node(sock_state));
}

void port_cancel_socket_update(port_state_t* port_state,
                               sock_state_t* sock_state) {
  unused_var(port_state);
  if (!queue_enqueued(sock_state_to_queue_node(sock_state)))
    return;
  queue_remove(sock_state_to_queue_node(sock_state));
}

void port_add_deleted_socket(port_state_t* port_state,
                             sock_state_t* sock_state) {
  if (queue_enqueued(sock_state_to_queue_node(sock_state)))
    return;
  queue_append(&port_state->sock_deleted_queue,
               sock_state_to_queue_node(sock_state));
}

void port_remove_deleted_socket(port_state_t* port_state,
                                sock_state_t* sock_state) {
  unused_var(port_state);
  if (!queue_enqueued(sock_state_to_queue_node(sock_state)))
    return;
  queue_remove(sock_state_to_queue_node(sock_state));
}

HANDLE port_get_iocp_handle(port_state_t* port_state) {
  assert(port_state->iocp_handle != NULL);
  return port_state->iocp_handle;
}

queue_t* port_get_poll_group_queue(port_state_t* port_state) {
  return &port_state->poll_group_queue;
}

port_state_t* port_state_from_handle_tree_node(ts_tree_node_t* tree_node) {
  return container_of(tree_node, port_state_t, handle_tree_node);
}

ts_tree_node_t* port_state_to_handle_tree_node(port_state_t* port_state) {
  return &port_state->handle_tree_node;
}

void queue_init(queue_t* queue) {
  queue_node_init(&queue->head);
}

void queue_node_init(queue_node_t* node) {
  node->prev = node;
  node->next = node;
}

static inline void queue__detach_node(queue_node_t* node) {
  node->prev->next = node->next;
  node->next->prev = node->prev;
}

queue_node_t* queue_first(const queue_t* queue) {
  return !queue_empty(queue) ? queue->head.next : NULL;
}

queue_node_t* queue_last(const queue_t* queue) {
  return !queue_empty(queue) ? queue->head.prev : NULL;
}

void queue_prepend(queue_t* queue, queue_node_t* node) {
  node->next = queue->head.next;
  node->prev = &queue->head;
  node->next->prev = node;
  queue->head.next = node;
}

void queue_append(queue_t* queue, queue_node_t* node) {
  node->next = &queue->head;
  node->prev = queue->head.prev;
  node->prev->next = node;
  queue->head.prev = node;
}

void queue_move_first(queue_t* queue, queue_node_t* node) {
  queue__detach_node(node);
  queue_prepend(queue, node);
}

void queue_move_last(queue_t* queue, queue_node_t* node) {
  queue__detach_node(node);
  queue_append(queue, node);
}

void queue_remove(queue_node_t* node) {
  queue__detach_node(node);
  queue_node_init(node);
}

bool queue_empty(const queue_t* queue) {
  return !queue_enqueued(&queue->head);
}

bool queue_enqueued(const queue_node_t* node) {
  return node->prev != node;
}

static const long REFLOCK__REF          = (long) 0x00000001;
static const long REFLOCK__REF_MASK     = (long) 0x0fffffff;
static const long REFLOCK__DESTROY      = (long) 0x10000000;
static const long REFLOCK__DESTROY_MASK = (long) 0xf0000000;
static const long REFLOCK__POISON       = (long) 0x300dead0;
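
/* The reflock state packs two fields into one 32-bit value: the low 28 bits
 * (REFLOCK__REF_MASK) hold the reference count, counted in units of
 * REFLOCK__REF, and bit 28 (REFLOCK__DESTROY) is set exactly once by
 * reflock_unref_and_destroy(). REFLOCK__POISON is written after destruction so
 * that any later use trips the assertions in the functions below. */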

static HANDLE reflock__keyed_event = NULL;

int reflock_global_init(void) {
  NTSTATUS status = NtCreateKeyedEvent(
      &reflock__keyed_event, KEYEDEVENT_ALL_ACCESS, NULL, 0);
  if (status != STATUS_SUCCESS)
    return_set_error(-1, RtlNtStatusToDosError(status));
  return 0;
}

void reflock_init(reflock_t* reflock) {
  reflock->state = 0;
}

static void reflock__signal_event(void* address) {
  NTSTATUS status =
      NtReleaseKeyedEvent(reflock__keyed_event, address, FALSE, NULL);
  if (status != STATUS_SUCCESS)
    abort();
}

static void reflock__await_event(void* address) {
  NTSTATUS status =
      NtWaitForKeyedEvent(reflock__keyed_event, address, FALSE, NULL);
  if (status != STATUS_SUCCESS)
    abort();
}

void reflock_ref(reflock_t* reflock) {
  long state = InterlockedAdd(&reflock->state, REFLOCK__REF);

  /* Verify that the counter didn't overflow and the lock isn't destroyed. */
  assert((state & REFLOCK__DESTROY_MASK) == 0);
  unused_var(state);
}

void reflock_unref(reflock_t* reflock) {
  long state = InterlockedAdd(&reflock->state, -REFLOCK__REF);

  /* Verify that the lock was referenced and not already destroyed. */
  assert((state & REFLOCK__DESTROY_MASK & ~REFLOCK__DESTROY) == 0);

  if (state == REFLOCK__DESTROY)
    reflock__signal_event(reflock);
}

void reflock_unref_and_destroy(reflock_t* reflock) {
  long state =
      InterlockedAdd(&reflock->state, REFLOCK__DESTROY - REFLOCK__REF);
  long ref_count = state & REFLOCK__REF_MASK;

  /* Verify that the lock was referenced and not already destroyed. */
  assert((state & REFLOCK__DESTROY_MASK) == REFLOCK__DESTROY);

  if (ref_count != 0)
    reflock__await_event(reflock);

  state = InterlockedExchange(&reflock->state, REFLOCK__POISON);
  assert(state == REFLOCK__DESTROY);
}

static const uint32_t SOCK__KNOWN_EPOLL_EVENTS =
    EPOLLIN | EPOLLPRI | EPOLLOUT | EPOLLERR | EPOLLHUP | EPOLLRDNORM |
    EPOLLRDBAND | EPOLLWRNORM | EPOLLWRBAND | EPOLLMSG | EPOLLRDHUP;
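
/* Note that EPOLLONESHOT is not part of this mask; it is a behavior flag
 * rather than a reportable event. */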
1595
1596 typedef enum sock__poll_status {
1597   SOCK__POLL_IDLE = 0,
1598   SOCK__POLL_PENDING,
1599   SOCK__POLL_CANCELLED
1600 } sock__poll_status_t;
1601
1602 typedef struct sock_state {
1603   IO_STATUS_BLOCK io_status_block;
1604   AFD_POLL_INFO poll_info;
1605   queue_node_t queue_node;
1606   tree_node_t tree_node;
1607   poll_group_t* poll_group;
1608   SOCKET base_socket;
1609   epoll_data_t user_data;
1610   uint32_t user_events;
1611   uint32_t pending_events;
1612   sock__poll_status_t poll_status;
1613   bool delete_pending;
1614 } sock_state_t;

static inline sock_state_t* sock__alloc(void) {
  sock_state_t* sock_state = malloc(sizeof *sock_state);
  if (sock_state == NULL)
    return_set_error(NULL, ERROR_NOT_ENOUGH_MEMORY);
  return sock_state;
}

static inline void sock__free(sock_state_t* sock_state) {
  free(sock_state);
}

static int sock__cancel_poll(sock_state_t* sock_state) {
  assert(sock_state->poll_status == SOCK__POLL_PENDING);

  if (afd_cancel_poll(poll_group_get_afd_helper_handle(sock_state->poll_group),
                      &sock_state->io_status_block) < 0)
    return -1;

  sock_state->poll_status = SOCK__POLL_CANCELLED;
  sock_state->pending_events = 0;
  return 0;
}

sock_state_t* sock_new(port_state_t* port_state, SOCKET socket) {
  SOCKET base_socket;
  poll_group_t* poll_group;
  sock_state_t* sock_state;

  if (socket == 0 || socket == INVALID_SOCKET)
    return_set_error(NULL, ERROR_INVALID_HANDLE);

  base_socket = ws_get_base_socket(socket);
  if (base_socket == INVALID_SOCKET)
    return NULL;

  poll_group = poll_group_acquire(port_state);
  if (poll_group == NULL)
    return NULL;

  sock_state = sock__alloc();
  if (sock_state == NULL)
    goto err1;

  memset(sock_state, 0, sizeof *sock_state);

  sock_state->base_socket = base_socket;
  sock_state->poll_group = poll_group;

  tree_node_init(&sock_state->tree_node);
  queue_node_init(&sock_state->queue_node);

  if (port_register_socket_handle(port_state, sock_state, socket) < 0)
    goto err2;

  return sock_state;

err2:
  sock__free(sock_state);
err1:
  poll_group_release(poll_group);

  return NULL;
}

static int sock__delete(port_state_t* port_state,
                        sock_state_t* sock_state,
                        bool force) {
  if (!sock_state->delete_pending) {
    if (sock_state->poll_status == SOCK__POLL_PENDING)
      sock__cancel_poll(sock_state);

    port_cancel_socket_update(port_state, sock_state);
    port_unregister_socket_handle(port_state, sock_state);

    sock_state->delete_pending = true;
  }

  /* If the poll request still needs to complete, the sock_state object can't
   * be free()d yet. `sock_feed_event()` or `port_close()` will take care
   * of this later. */
  if (force || sock_state->poll_status == SOCK__POLL_IDLE) {
    /* Free the sock_state now. */
    port_remove_deleted_socket(port_state, sock_state);
    poll_group_release(sock_state->poll_group);
    sock__free(sock_state);
  } else {
    /* Free the socket later. */
    port_add_deleted_socket(port_state, sock_state);
  }

  return 0;
}

void sock_delete(port_state_t* port_state, sock_state_t* sock_state) {
  sock__delete(port_state, sock_state, false);
}

void sock_force_delete(port_state_t* port_state, sock_state_t* sock_state) {
  sock__delete(port_state, sock_state, true);
}
int sock_set_event(port_state_t* port_state,
                   sock_state_t* sock_state,
                   const struct epoll_event* ev) {
  /* EPOLLERR and EPOLLHUP are always reported, even when not requested by the
   * caller. However, they are disabled after an event has been reported for a
   * socket for which the EPOLLONESHOT flag was set. */
  uint32_t events = ev->events | EPOLLERR | EPOLLHUP;

  sock_state->user_events = events;
  sock_state->user_data = ev->data;

  if ((events & SOCK__KNOWN_EPOLL_EVENTS & ~sock_state->pending_events) != 0)
    port_request_socket_update(port_state, sock_state);

  return 0;
}
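
/* A minimal caller-side sketch (illustrative only; in this library the call
 * arrives here via the epoll_ctl() path):
 *
 *   struct epoll_event ev;
 *   ev.events = EPOLLIN | EPOLLONESHOT; // EPOLLERR/EPOLLHUP are implied
 *   ev.data.sock = my_socket;           // placeholder; echoed back verbatim
 *                                       // when the event is delivered
 *   sock_set_event(port_state, sock_state, &ev);
 */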

static inline DWORD sock__epoll_events_to_afd_events(uint32_t epoll_events) {
  /* Always monitor for AFD_POLL_LOCAL_CLOSE, which is triggered when the
   * socket is closed with closesocket() or CloseHandle(). */
  DWORD afd_events = AFD_POLL_LOCAL_CLOSE;

  if (epoll_events & (EPOLLIN | EPOLLRDNORM))
    afd_events |= AFD_POLL_RECEIVE | AFD_POLL_ACCEPT;
  if (epoll_events & (EPOLLPRI | EPOLLRDBAND))
    afd_events |= AFD_POLL_RECEIVE_EXPEDITED;
  if (epoll_events & (EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND))
    afd_events |= AFD_POLL_SEND;
  if (epoll_events & (EPOLLIN | EPOLLRDNORM | EPOLLRDHUP))
    afd_events |= AFD_POLL_DISCONNECT;
  if (epoll_events & EPOLLHUP)
    afd_events |= AFD_POLL_ABORT;
  if (epoll_events & EPOLLERR)
    afd_events |= AFD_POLL_CONNECT_FAIL;

  return afd_events;
}

static inline uint32_t sock__afd_events_to_epoll_events(DWORD afd_events) {
  uint32_t epoll_events = 0;

  if (afd_events & (AFD_POLL_RECEIVE | AFD_POLL_ACCEPT))
    epoll_events |= EPOLLIN | EPOLLRDNORM;
  if (afd_events & AFD_POLL_RECEIVE_EXPEDITED)
    epoll_events |= EPOLLPRI | EPOLLRDBAND;
  if (afd_events & AFD_POLL_SEND)
    epoll_events |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
  if (afd_events & AFD_POLL_DISCONNECT)
    epoll_events |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
  if (afd_events & AFD_POLL_ABORT)
    epoll_events |= EPOLLHUP;
  if (afd_events & AFD_POLL_CONNECT_FAIL)
    /* Linux reports all these events after connect() has failed. */
    epoll_events |=
        EPOLLIN | EPOLLOUT | EPOLLERR | EPOLLRDNORM | EPOLLWRNORM | EPOLLRDHUP;

  return epoll_events;
}
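
/* Worked example of the two mappings above: a caller asking for EPOLLIN gets
 * AFD_POLL_RECEIVE | AFD_POLL_ACCEPT | AFD_POLL_DISCONNECT monitored (plus
 * the always-on AFD_POLL_LOCAL_CLOSE). If the peer then shuts down the
 * connection, AFD reports AFD_POLL_DISCONNECT, which maps back to
 * EPOLLIN | EPOLLRDNORM | EPOLLRDHUP; the EPOLLRDNORM/EPOLLRDHUP bits are
 * filtered out later in sock_feed_event() if the caller didn't ask for
 * them. */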

int sock_update(port_state_t* port_state, sock_state_t* sock_state) {
  assert(!sock_state->delete_pending);

  if ((sock_state->poll_status == SOCK__POLL_PENDING) &&
      (sock_state->user_events & SOCK__KNOWN_EPOLL_EVENTS &
       ~sock_state->pending_events) == 0) {
    /* All the events the user is interested in are already being monitored by
     * the pending poll operation. It might spuriously complete because of an
     * event that we're no longer interested in; when that happens we'll submit
     * a new poll operation with the updated event mask. */

  } else if (sock_state->poll_status == SOCK__POLL_PENDING) {
    /* A poll operation is already pending, but it's not monitoring for all the
     * events that the user is interested in. Therefore, cancel the pending
     * poll operation; when we receive its completion packet, a new poll
     * operation will be submitted with the correct event mask. */
    if (sock__cancel_poll(sock_state) < 0)
      return -1;

  } else if (sock_state->poll_status == SOCK__POLL_CANCELLED) {
    /* The poll operation has already been cancelled, but we're still waiting
     * for it to return. For now, there's nothing that needs to be done. */

  } else if (sock_state->poll_status == SOCK__POLL_IDLE) {
    /* No poll operation is pending; start one. */
    sock_state->poll_info.Exclusive = FALSE;
    sock_state->poll_info.NumberOfHandles = 1;
    sock_state->poll_info.Timeout.QuadPart = INT64_MAX;
    sock_state->poll_info.Handles[0].Handle = (HANDLE) sock_state->base_socket;
    sock_state->poll_info.Handles[0].Status = 0;
    sock_state->poll_info.Handles[0].Events =
        sock__epoll_events_to_afd_events(sock_state->user_events);

    if (afd_poll(poll_group_get_afd_helper_handle(sock_state->poll_group),
                 &sock_state->poll_info,
                 &sock_state->io_status_block) < 0) {
      switch (GetLastError()) {
        case ERROR_IO_PENDING:
          /* Overlapped poll operation in progress; this is expected. */
          break;
        case ERROR_INVALID_HANDLE:
          /* Socket closed; it'll be dropped from the epoll set. */
          return sock__delete(port_state, sock_state, false);
        default:
          /* Other errors are propagated to the caller. */
          return_map_error(-1);
      }
    }

    /* The poll request was successfully submitted. */
    sock_state->poll_status = SOCK__POLL_PENDING;
    sock_state->pending_events = sock_state->user_events;

  } else {
    /* Unreachable. */
    assert(false);
  }

  port_cancel_socket_update(port_state, sock_state);
  return 0;
}
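
/* In summary, sock_update() implements this decision table:
 *
 *   poll_status            pending mask covers user mask?   action
 *   ---------------------  -------------------------------  ----------------
 *   SOCK__POLL_PENDING     yes                              nothing to do
 *   SOCK__POLL_PENDING     no                               cancel the poll;
 *                                                           resubmit later
 *   SOCK__POLL_CANCELLED   --                               wait for the
 *                                                           completion packet
 *   SOCK__POLL_IDLE        --                               submit a new
 *                                                           AFD poll
 */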

int sock_feed_event(port_state_t* port_state,
                    IO_STATUS_BLOCK* io_status_block,
                    struct epoll_event* ev) {
  sock_state_t* sock_state =
      container_of(io_status_block, sock_state_t, io_status_block);
  AFD_POLL_INFO* poll_info = &sock_state->poll_info;
  uint32_t epoll_events = 0;

  sock_state->poll_status = SOCK__POLL_IDLE;
  sock_state->pending_events = 0;

  if (sock_state->delete_pending) {
    /* Socket has been deleted earlier and can now be freed. */
    return sock__delete(port_state, sock_state, false);

  } else if (io_status_block->Status == STATUS_CANCELLED) {
    /* The poll request was cancelled by CancelIoEx. */

  } else if (!NT_SUCCESS(io_status_block->Status)) {
    /* The overlapped request itself failed in an unexpected way. */
    epoll_events = EPOLLERR;

  } else if (poll_info->NumberOfHandles < 1) {
    /* This poll operation succeeded but didn't report any socket events. */

  } else if (poll_info->Handles[0].Events & AFD_POLL_LOCAL_CLOSE) {
    /* The poll operation reported that the socket was closed. */
    return sock__delete(port_state, sock_state, false);

  } else {
    /* Events related to our socket were reported. */
    epoll_events =
        sock__afd_events_to_epoll_events(poll_info->Handles[0].Events);
  }

  /* Requeue the socket so a new poll request will be submitted. */
  port_request_socket_update(port_state, sock_state);

  /* Filter out events that the user didn't ask for. */
  epoll_events &= sock_state->user_events;

  /* Return if there are no epoll events to report. */
  if (epoll_events == 0)
    return 0;

  /* If the socket has the EPOLLONESHOT flag set, unmonitor all events,
   * even EPOLLERR and EPOLLHUP. But always keep looking for closed sockets. */
  if (sock_state->user_events & EPOLLONESHOT)
    sock_state->user_events = 0;

  ev->data = sock_state->user_data;
  ev->events = epoll_events;
  return 1;
}

sock_state_t* sock_state_from_queue_node(queue_node_t* queue_node) {
  return container_of(queue_node, sock_state_t, queue_node);
}

queue_node_t* sock_state_to_queue_node(sock_state_t* sock_state) {
  return &sock_state->queue_node;
}

sock_state_t* sock_state_from_tree_node(tree_node_t* tree_node) {
  return container_of(tree_node, sock_state_t, tree_node);
}

tree_node_t* sock_state_to_tree_node(sock_state_t* sock_state) {
  return &sock_state->tree_node;
}
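
/* These helpers invert "take the address of an embedded member". Assuming
 * the usual container_of() definition (the one used earlier in this file may
 * differ in spelling but not in effect):
 *
 *   #define container_of(ptr, type, member) \
 *     ((type*) ((uintptr_t) (ptr) - offsetof(type, member)))
 *
 * i.e. step back from the member pointer to the start of the enclosing
 * struct. */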

void ts_tree_init(ts_tree_t* ts_tree) {
  tree_init(&ts_tree->tree);
  InitializeSRWLock(&ts_tree->lock);
}

void ts_tree_node_init(ts_tree_node_t* node) {
  tree_node_init(&node->tree_node);
  reflock_init(&node->reflock);
}

int ts_tree_add(ts_tree_t* ts_tree, ts_tree_node_t* node, uintptr_t key) {
  int r;

  AcquireSRWLockExclusive(&ts_tree->lock);
  r = tree_add(&ts_tree->tree, &node->tree_node, key);
  ReleaseSRWLockExclusive(&ts_tree->lock);

  return r;
}

static inline ts_tree_node_t* ts_tree__find_node(ts_tree_t* ts_tree,
                                                 uintptr_t key) {
  tree_node_t* tree_node = tree_find(&ts_tree->tree, key);
  if (tree_node == NULL)
    return NULL;

  return container_of(tree_node, ts_tree_node_t, tree_node);
}

ts_tree_node_t* ts_tree_del_and_ref(ts_tree_t* ts_tree, uintptr_t key) {
  ts_tree_node_t* ts_tree_node;

  AcquireSRWLockExclusive(&ts_tree->lock);

  ts_tree_node = ts_tree__find_node(ts_tree, key);
  if (ts_tree_node != NULL) {
    tree_del(&ts_tree->tree, &ts_tree_node->tree_node);
    reflock_ref(&ts_tree_node->reflock);
  }

  ReleaseSRWLockExclusive(&ts_tree->lock);

  return ts_tree_node;
}

ts_tree_node_t* ts_tree_find_and_ref(ts_tree_t* ts_tree, uintptr_t key) {
  ts_tree_node_t* ts_tree_node;

  AcquireSRWLockShared(&ts_tree->lock);

  ts_tree_node = ts_tree__find_node(ts_tree, key);
  if (ts_tree_node != NULL)
    reflock_ref(&ts_tree_node->reflock);

  ReleaseSRWLockShared(&ts_tree->lock);

  return ts_tree_node;
}

void ts_tree_node_unref(ts_tree_node_t* node) {
  reflock_unref(&node->reflock);
}

void ts_tree_node_unref_and_destroy(ts_tree_node_t* node) {
  reflock_unref_and_destroy(&node->reflock);
}
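
/* The ts_tree combines the SRW lock with the per-node reflock: lookups hold
 * the lock only long enough to find the node and bump its reference count,
 * while ts_tree_del_and_ref() unlinks the node and references it in one
 * critical section. A deleter then calls ts_tree_node_unref_and_destroy(),
 * which blocks until every reader obtained through ts_tree_find_and_ref()
 * has called ts_tree_node_unref(), so the node's storage is never freed out
 * from under a concurrent user. */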

void tree_init(tree_t* tree) {
  memset(tree, 0, sizeof *tree);
}

void tree_node_init(tree_node_t* node) {
  memset(node, 0, sizeof *node);
}

#define TREE__ROTATE(cis, trans)   \
  tree_node_t* p = node;           \
  tree_node_t* q = node->trans;    \
  tree_node_t* parent = p->parent; \
                                   \
  if (parent) {                    \
    if (parent->left == p)         \
      parent->left = q;            \
    else                           \
      parent->right = q;           \
  } else {                         \
    tree->root = q;                \
  }                                \
                                   \
  q->parent = parent;              \
  p->parent = q;                   \
  p->trans = q->cis;               \
  if (p->trans)                    \
    p->trans->parent = p;          \
  q->cis = p;

static inline void tree__rotate_left(tree_t* tree, tree_node_t* node) {
  TREE__ROTATE(left, right)
}

static inline void tree__rotate_right(tree_t* tree, tree_node_t* node) {
  TREE__ROTATE(right, left)
}
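
/* The `cis`/`trans` parameters name the two child directions so that a
 * single macro body yields both rotations. For tree__rotate_left(tree, P),
 * with Q = P->right:
 *
 *      P                Q
 *     / \              / \
 *    a   Q    ==>     P   c
 *       / \          / \
 *      b   c        a   b
 *
 * P's old parent (or tree->root) is repointed at Q, and subtree b is moved
 * from Q over to P. */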

#define TREE__INSERT_OR_DESCEND(side) \
  if (parent->side) {                 \
    parent = parent->side;            \
  } else {                            \
    parent->side = node;              \
    break;                            \
  }

#define TREE__REBALANCE_AFTER_INSERT(cis, trans) \
  tree_node_t* grandparent = parent->parent;     \
  tree_node_t* uncle = grandparent->trans;       \
                                                 \
  if (uncle && uncle->red) {                     \
    parent->red = uncle->red = false;            \
    grandparent->red = true;                     \
    node = grandparent;                          \
  } else {                                       \
    if (node == parent->trans) {                 \
      tree__rotate_##cis(tree, parent);          \
      node = parent;                             \
      parent = node->parent;                     \
    }                                            \
    parent->red = false;                         \
    grandparent->red = true;                     \
    tree__rotate_##trans(tree, grandparent);     \
  }
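
/* This is the textbook red-black insertion fix-up: while the parent of the
 * newly inserted (red) node is also red, either (a) the uncle is red, so
 * parent and uncle are recolored black and the grandparent red, moving the
 * violation two levels up, or (b) the uncle is black, in which case at most
 * two rotations around parent and grandparent restore the invariants and
 * end the loop. */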

int tree_add(tree_t* tree, tree_node_t* node, uintptr_t key) {
  tree_node_t* parent;

  parent = tree->root;
  if (parent) {
    for (;;) {
      if (key < parent->key) {
        TREE__INSERT_OR_DESCEND(left)
      } else if (key > parent->key) {
        TREE__INSERT_OR_DESCEND(right)
      } else {
        return -1;
      }
    }
  } else {
    tree->root = node;
  }

  node->key = key;
  node->left = node->right = NULL;
  node->parent = parent;
  node->red = true;

  for (; parent && parent->red; parent = node->parent) {
    if (parent == parent->parent->left) {
      TREE__REBALANCE_AFTER_INSERT(left, right)
    } else {
      TREE__REBALANCE_AFTER_INSERT(right, left)
    }
  }
  tree->root->red = false;

  return 0;
}
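
/* An illustrative use of the tree API (a sketch; `my_key` is a placeholder):
 *
 *   tree_t tree;
 *   tree_node_t node;
 *
 *   tree_init(&tree);
 *   tree_node_init(&node);
 *   if (tree_add(&tree, &node, my_key) < 0)
 *     abort(); // duplicate key: tree_add() returns -1
 *   assert(tree_find(&tree, my_key) == &node);
 */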

#define TREE__REBALANCE_AFTER_REMOVE(cis, trans)   \
  tree_node_t* sibling = parent->trans;            \
                                                   \
  if (sibling->red) {                              \
    sibling->red = false;                          \
    parent->red = true;                            \
    tree__rotate_##cis(tree, parent);              \
    sibling = parent->trans;                       \
  }                                                \
  if ((sibling->left && sibling->left->red) ||     \
      (sibling->right && sibling->right->red)) {   \
    if (!sibling->trans || !sibling->trans->red) { \
      sibling->cis->red = false;                   \
      sibling->red = true;                         \
      tree__rotate_##trans(tree, sibling);         \
      sibling = parent->trans;                     \
    }                                              \
    sibling->red = parent->red;                    \
    parent->red = sibling->trans->red = false;     \
    tree__rotate_##cis(tree, parent);              \
    node = tree->root;                             \
    break;                                         \
  }                                                \
  sibling->red = true;
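
/* The textbook red-black removal fix-up: a red sibling is first rotated
 * into a black one; a black sibling with at least one red child rebalances
 * the tree with one or two rotations and terminates the loop; a black
 * sibling with two black children is recolored red, pushing the "missing
 * black" problem one level up the tree. */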

void tree_del(tree_t* tree, tree_node_t* node) {
  tree_node_t* parent = node->parent;
  tree_node_t* left = node->left;
  tree_node_t* right = node->right;
  tree_node_t* next;
  bool red;

  if (!left) {
    next = right;
  } else if (!right) {
    next = left;
  } else {
    next = right;
    while (next->left)
      next = next->left;
  }

  if (parent) {
    if (parent->left == node)
      parent->left = next;
    else
      parent->right = next;
  } else {
    tree->root = next;
  }

  if (left && right) {
    red = next->red;
    next->red = node->red;
    next->left = left;
    left->parent = next;
    if (next != right) {
      parent = next->parent;
      next->parent = node->parent;
      node = next->right;
      parent->left = node;
      next->right = right;
      right->parent = next;
    } else {
      next->parent = parent;
      parent = next;
      node = next->right;
    }
  } else {
    red = node->red;
    node = next;
  }

  if (node)
    node->parent = parent;
  if (red)
    return;
  if (node && node->red) {
    node->red = false;
    return;
  }

  do {
    if (node == tree->root)
      break;
    if (node == parent->left) {
      TREE__REBALANCE_AFTER_REMOVE(left, right)
    } else {
      TREE__REBALANCE_AFTER_REMOVE(right, left)
    }
    node = parent;
    parent = parent->parent;
  } while (!node->red);

  if (node)
    node->red = false;
}

tree_node_t* tree_find(const tree_t* tree, uintptr_t key) {
  tree_node_t* node = tree->root;
  while (node) {
    if (key < node->key)
      node = node->left;
    else if (key > node->key)
      node = node->right;
    else
      return node;
  }
  return NULL;
}

tree_node_t* tree_root(const tree_t* tree) {
  return tree->root;
}

#ifndef SIO_BASE_HANDLE
#define SIO_BASE_HANDLE 0x48000022
#endif
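
/* SIO_BASE_HANDLE retrieves the socket handle of the base protocol provider,
 * stepping over any layered service providers (LSPs) that may have wrapped
 * the socket; wepoll issues its AFD poll requests against that base handle.
 * The constant is defined here as a fallback because some older SDK headers
 * don't provide it. */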

int ws_global_init(void) {
  int r;
  WSADATA wsa_data;

  r = WSAStartup(MAKEWORD(2, 2), &wsa_data);
  if (r != 0)
    return_set_error(-1, (DWORD) r);

  return 0;
}

SOCKET ws_get_base_socket(SOCKET socket) {
  SOCKET base_socket;
  DWORD bytes;

  if (WSAIoctl(socket,
               SIO_BASE_HANDLE,
               NULL,
               0,
               &base_socket,
               sizeof base_socket,
               &bytes,
               NULL,
               NULL) == SOCKET_ERROR)
    return_map_error(INVALID_SOCKET);

  return base_socket;
}