/*
 * wepoll - epoll for Windows
 * https://github.com/piscisaureus/wepoll
 *
 * Copyright 2012-2020, Bert Belder <bertbelder@gmail.com>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met:
 *
 *   * Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
enum EPOLL_EVENTS {
  EPOLLIN      = (int) (1U <<  0),
  EPOLLPRI     = (int) (1U <<  1),
  EPOLLOUT     = (int) (1U <<  2),
  EPOLLERR     = (int) (1U <<  3),
  EPOLLHUP     = (int) (1U <<  4),
  EPOLLRDNORM  = (int) (1U <<  6),
  EPOLLRDBAND  = (int) (1U <<  7),
  EPOLLWRNORM  = (int) (1U <<  8),
  EPOLLWRBAND  = (int) (1U <<  9),
  EPOLLMSG     = (int) (1U << 10), /* Never reported. */
  EPOLLRDHUP   = (int) (1U << 13),
  EPOLLONESHOT = (int) (1U << 31)
};
#define EPOLLIN      (1U <<  0)
#define EPOLLPRI     (1U <<  1)
#define EPOLLOUT     (1U <<  2)
#define EPOLLERR     (1U <<  3)
#define EPOLLHUP     (1U <<  4)
#define EPOLLRDNORM  (1U <<  6)
#define EPOLLRDBAND  (1U <<  7)
#define EPOLLWRNORM  (1U <<  8)
#define EPOLLWRBAND  (1U <<  9)
#define EPOLLMSG     (1U << 10)
#define EPOLLRDHUP   (1U << 13)
#define EPOLLONESHOT (1U << 31)
#define EPOLL_CTL_ADD 1
#define EPOLL_CTL_MOD 2
#define EPOLL_CTL_DEL 3
typedef void* HANDLE;
typedef uintptr_t SOCKET;
typedef union epoll_data {
  void* ptr;
  int fd;
  uint32_t u32;
  uint64_t u64;
  SOCKET sock; /* Windows specific */
  HANDLE hnd;  /* Windows specific */
} epoll_data_t;

struct epoll_event {
  uint32_t events;   /* Epoll events and flags */
  epoll_data_t data; /* User data variable */
};

#ifdef __cplusplus
extern "C" {
#endif
WEPOLL_EXPORT HANDLE epoll_create(int size);
WEPOLL_EXPORT HANDLE epoll_create1(int flags);

WEPOLL_EXPORT int epoll_close(HANDLE ephnd);

WEPOLL_EXPORT int epoll_ctl(HANDLE ephnd,
                            int op,
                            SOCKET sock,
                            struct epoll_event* event);

WEPOLL_EXPORT int epoll_wait(HANDLE ephnd,
                             struct epoll_event* events,
                             int maxevents,
                             int timeout);

#ifdef __cplusplus
} /* extern "C" */
#endif
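/* Illustrative usage sketch (not part of wepoll itself): a minimal event
 * loop built on the API declared above. `listen_sock` is assumed to be a
 * bound, listening, non-blocking socket created elsewhere, and
 * handle_readable() is a hypothetical handler.
 *
 *   HANDLE ephnd = epoll_create1(0);
 *   struct epoll_event ev, out[8];
 *   int i, n;
 *
 *   ev.events = EPOLLIN;
 *   ev.data.sock = listen_sock;
 *   if (ephnd == NULL || epoll_ctl(ephnd, EPOLL_CTL_ADD, listen_sock, &ev) < 0)
 *     abort();
 *
 *   for (;;) {
 *     n = epoll_wait(ephnd, out, 8, -1);    (-1 blocks until events arrive)
 *     for (i = 0; i < n; i++)
 *       if (out[i].events & EPOLLIN)
 *         handle_readable(out[i].data.sock);
 *   }
 *
 *   epoll_close(ephnd);
 */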
#define WEPOLL_INTERNAL static
#define WEPOLL_INTERNAL_VAR static
#ifndef WIN32_LEAN_AND_MEAN
#define WIN32_LEAN_AND_MEAN
#endif
#if defined(__clang__)
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wreserved-id-macro"
#endif

#ifndef _WIN32_WINNT
#define _WIN32_WINNT 0x0600
#endif

#if defined(__clang__)
#pragma clang diagnostic pop
#endif
#ifdef _MSC_VER
#pragma warning(push, 1)
#endif

#include <ws2tcpip.h>
#include <winsock2.h>
#include <windows.h>

#ifdef _MSC_VER
#pragma warning(pop)
#endif

#include <assert.h>
#include <errno.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
WEPOLL_INTERNAL int nt_global_init(void);

typedef LONG NTSTATUS;
typedef NTSTATUS* PNTSTATUS;

#ifndef NT_SUCCESS
#define NT_SUCCESS(status) (((NTSTATUS)(status)) >= 0)
#endif

#ifndef STATUS_SUCCESS
#define STATUS_SUCCESS ((NTSTATUS) 0x00000000L)
#endif

#ifndef STATUS_PENDING
#define STATUS_PENDING ((NTSTATUS) 0x00000103L)
#endif

#ifndef STATUS_CANCELLED
#define STATUS_CANCELLED ((NTSTATUS) 0xC0000120L)
#endif

#ifndef STATUS_NOT_FOUND
#define STATUS_NOT_FOUND ((NTSTATUS) 0xC0000225L)
#endif
typedef struct _IO_STATUS_BLOCK {
  NTSTATUS Status;
  ULONG_PTR Information;
} IO_STATUS_BLOCK, *PIO_STATUS_BLOCK;

typedef VOID(NTAPI* PIO_APC_ROUTINE)(PVOID ApcContext,
                                     PIO_STATUS_BLOCK IoStatusBlock,
                                     ULONG Reserved);

typedef struct _UNICODE_STRING {
  USHORT Length;
  USHORT MaximumLength;
  PWSTR Buffer;
} UNICODE_STRING, *PUNICODE_STRING;

#define RTL_CONSTANT_STRING(s) \
  { sizeof(s) - sizeof((s)[0]), sizeof(s), s }

typedef struct _OBJECT_ATTRIBUTES {
  ULONG Length;
  HANDLE RootDirectory;
  PUNICODE_STRING ObjectName;
  ULONG Attributes;
  PVOID SecurityDescriptor;
  PVOID SecurityQualityOfService;
} OBJECT_ATTRIBUTES, *POBJECT_ATTRIBUTES;

#define RTL_CONSTANT_OBJECT_ATTRIBUTES(ObjectName, Attributes) \
  { sizeof(OBJECT_ATTRIBUTES), NULL, ObjectName, Attributes, NULL, NULL }

#define FILE_OPEN 0x00000001UL

#define KEYEDEVENT_WAIT 0x00000001UL
#define KEYEDEVENT_WAKE 0x00000002UL
#define KEYEDEVENT_ALL_ACCESS \
  (STANDARD_RIGHTS_REQUIRED | KEYEDEVENT_WAIT | KEYEDEVENT_WAKE)
#define NT_NTDLL_IMPORT_LIST(X)           \
  X(NTSTATUS,                             \
    NTAPI,                                \
    NtCancelIoFileEx,                     \
    (HANDLE FileHandle,                   \
     PIO_STATUS_BLOCK IoRequestToCancel,  \
     PIO_STATUS_BLOCK IoStatusBlock))     \
                                          \
  X(NTSTATUS,                             \
    NTAPI,                                \
    NtCreateFile,                         \
    (PHANDLE FileHandle,                  \
     ACCESS_MASK DesiredAccess,           \
     POBJECT_ATTRIBUTES ObjectAttributes, \
     PIO_STATUS_BLOCK IoStatusBlock,      \
     PLARGE_INTEGER AllocationSize,       \
     ULONG FileAttributes,                \
     ULONG ShareAccess,                   \
     ULONG CreateDisposition,             \
     ULONG CreateOptions,                 \
     PVOID EaBuffer,                      \
     ULONG EaLength))                     \
                                          \
  X(NTSTATUS,                             \
    NTAPI,                                \
    NtCreateKeyedEvent,                   \
    (PHANDLE KeyedEventHandle,            \
     ACCESS_MASK DesiredAccess,           \
     POBJECT_ATTRIBUTES ObjectAttributes, \
     ULONG Flags))                        \
                                          \
  X(NTSTATUS,                             \
    NTAPI,                                \
    NtDeviceIoControlFile,                \
    (HANDLE FileHandle,                   \
     HANDLE Event,                        \
     PIO_APC_ROUTINE ApcRoutine,          \
     PVOID ApcContext,                    \
     PIO_STATUS_BLOCK IoStatusBlock,      \
     ULONG IoControlCode,                 \
     PVOID InputBuffer,                   \
     ULONG InputBufferLength,             \
     PVOID OutputBuffer,                  \
     ULONG OutputBufferLength))           \
                                          \
  X(NTSTATUS,                             \
    NTAPI,                                \
    NtReleaseKeyedEvent,                  \
    (HANDLE KeyedEventHandle,             \
     PVOID KeyValue,                      \
     BOOLEAN Alertable,                   \
     PLARGE_INTEGER Timeout))             \
                                          \
  X(NTSTATUS,                             \
    NTAPI,                                \
    NtWaitForKeyedEvent,                  \
    (HANDLE KeyedEventHandle,             \
     PVOID KeyValue,                      \
     BOOLEAN Alertable,                   \
     PLARGE_INTEGER Timeout))             \
                                          \
  X(ULONG, WINAPI, RtlNtStatusToDosError, (NTSTATUS Status))
#define X(return_type, attributes, name, parameters) \
  WEPOLL_INTERNAL_VAR return_type(attributes* name) parameters;
NT_NTDLL_IMPORT_LIST(X)
#undef X
#define AFD_POLL_RECEIVE           0x0001
#define AFD_POLL_RECEIVE_EXPEDITED 0x0002
#define AFD_POLL_SEND              0x0004
#define AFD_POLL_DISCONNECT        0x0008
#define AFD_POLL_ABORT             0x0010
#define AFD_POLL_LOCAL_CLOSE       0x0020
#define AFD_POLL_ACCEPT            0x0080
#define AFD_POLL_CONNECT_FAIL      0x0100

typedef struct _AFD_POLL_HANDLE_INFO {
  HANDLE Handle;
  ULONG Events;
  NTSTATUS Status;
} AFD_POLL_HANDLE_INFO, *PAFD_POLL_HANDLE_INFO;

typedef struct _AFD_POLL_INFO {
  LARGE_INTEGER Timeout;
  ULONG NumberOfHandles;
  ULONG Exclusive;
  AFD_POLL_HANDLE_INFO Handles[1];
} AFD_POLL_INFO, *PAFD_POLL_INFO;

WEPOLL_INTERNAL int afd_create_helper_handle(HANDLE iocp_handle,
                                             HANDLE* afd_helper_handle_out);

WEPOLL_INTERNAL int afd_poll(HANDLE afd_helper_handle,
                             AFD_POLL_INFO* poll_info,
                             IO_STATUS_BLOCK* io_status_block);
WEPOLL_INTERNAL int afd_cancel_poll(HANDLE afd_helper_handle,
                                    IO_STATUS_BLOCK* io_status_block);
#define return_map_error(value) \
  do {                          \
    err_map_win_error();        \
    return value;               \
  } while (0)

#define return_set_error(value, error) \
  do {                                 \
    err_set_win_error(error);          \
    return value;                      \
  } while (0)

WEPOLL_INTERNAL void err_map_win_error(void);
WEPOLL_INTERNAL void err_set_win_error(DWORD error);
WEPOLL_INTERNAL int err_check_handle(HANDLE handle);
#define IOCTL_AFD_POLL 0x00012024

static UNICODE_STRING afd__helper_name =
    RTL_CONSTANT_STRING(L"\\Device\\Afd\\Wepoll");

static OBJECT_ATTRIBUTES afd__helper_attributes =
    RTL_CONSTANT_OBJECT_ATTRIBUTES(&afd__helper_name, 0);

int afd_create_helper_handle(HANDLE iocp_handle,
                             HANDLE* afd_helper_handle_out) {
  HANDLE afd_helper_handle;
  IO_STATUS_BLOCK iosb;
  NTSTATUS status;

  /* By opening \Device\Afd without specifying any extended attributes, we'll
   * get a handle that lets us talk to the AFD driver, but that doesn't have an
   * associated endpoint (so it's not a socket). */
  status = NtCreateFile(&afd_helper_handle,
                        SYNCHRONIZE,
                        &afd__helper_attributes,
                        &iosb,
                        NULL,
                        0,
                        FILE_SHARE_READ | FILE_SHARE_WRITE,
                        FILE_OPEN,
                        0,
                        NULL,
                        0);
  if (status != STATUS_SUCCESS)
    return_set_error(-1, RtlNtStatusToDosError(status));

  if (CreateIoCompletionPort(afd_helper_handle, iocp_handle, 0, 0) == NULL)
    goto error;

  if (!SetFileCompletionNotificationModes(afd_helper_handle,
                                          FILE_SKIP_SET_EVENT_ON_HANDLE))
    goto error;

  *afd_helper_handle_out = afd_helper_handle;
  return 0;

error:
  CloseHandle(afd_helper_handle);
  return_map_error(-1);
}

int afd_poll(HANDLE afd_helper_handle,
             AFD_POLL_INFO* poll_info,
             IO_STATUS_BLOCK* io_status_block) {
  NTSTATUS status;

  /* Blocking operation is not supported. */
  assert(io_status_block != NULL);

  io_status_block->Status = STATUS_PENDING;
  status = NtDeviceIoControlFile(afd_helper_handle,
                                 NULL,
                                 NULL,
                                 io_status_block,
                                 io_status_block,
                                 IOCTL_AFD_POLL,
                                 poll_info,
                                 sizeof *poll_info,
                                 poll_info,
                                 sizeof *poll_info);

  if (status == STATUS_SUCCESS)
    return 0;
  else if (status == STATUS_PENDING)
    return_set_error(-1, ERROR_IO_PENDING);
  else
    return_set_error(-1, RtlNtStatusToDosError(status));
}

int afd_cancel_poll(HANDLE afd_helper_handle,
                    IO_STATUS_BLOCK* io_status_block) {
  NTSTATUS cancel_status;
  IO_STATUS_BLOCK cancel_iosb;

  /* If the poll operation has already completed or has been cancelled earlier,
   * there's nothing left for us to do. */
  if (io_status_block->Status != STATUS_PENDING)
    return 0;

  cancel_status =
      NtCancelIoFileEx(afd_helper_handle, io_status_block, &cancel_iosb);

  /* NtCancelIoFileEx() may return STATUS_NOT_FOUND if the operation completed
   * just before calling NtCancelIoFileEx(). This is not an error. */
  if (cancel_status == STATUS_SUCCESS || cancel_status == STATUS_NOT_FOUND)
    return 0;
  else
    return_set_error(-1, RtlNtStatusToDosError(cancel_status));
}
WEPOLL_INTERNAL int epoll_global_init(void);

WEPOLL_INTERNAL int init(void);

typedef struct port_state port_state_t;
typedef struct queue queue_t;
typedef struct sock_state sock_state_t;
typedef struct ts_tree_node ts_tree_node_t;

WEPOLL_INTERNAL port_state_t* port_new(HANDLE* iocp_handle_out);
WEPOLL_INTERNAL int port_close(port_state_t* port_state);
WEPOLL_INTERNAL int port_delete(port_state_t* port_state);

WEPOLL_INTERNAL int port_wait(port_state_t* port_state,
                              struct epoll_event* events,
                              int maxevents,
                              int timeout);

WEPOLL_INTERNAL int port_ctl(port_state_t* port_state,
                             int op,
                             SOCKET sock,
                             struct epoll_event* ev);

WEPOLL_INTERNAL int port_register_socket_handle(port_state_t* port_state,
                                                sock_state_t* sock_state,
                                                SOCKET socket);
WEPOLL_INTERNAL void port_unregister_socket_handle(port_state_t* port_state,
                                                   sock_state_t* sock_state);
WEPOLL_INTERNAL sock_state_t* port_find_socket(port_state_t* port_state,
                                               SOCKET socket);

WEPOLL_INTERNAL void port_request_socket_update(port_state_t* port_state,
                                                sock_state_t* sock_state);
WEPOLL_INTERNAL void port_cancel_socket_update(port_state_t* port_state,
                                               sock_state_t* sock_state);

WEPOLL_INTERNAL void port_add_deleted_socket(port_state_t* port_state,
                                             sock_state_t* sock_state);
WEPOLL_INTERNAL void port_remove_deleted_socket(port_state_t* port_state,
                                                sock_state_t* sock_state);

WEPOLL_INTERNAL HANDLE port_get_iocp_handle(port_state_t* port_state);
WEPOLL_INTERNAL queue_t* port_get_poll_group_queue(port_state_t* port_state);

WEPOLL_INTERNAL port_state_t* port_state_from_handle_tree_node(
    ts_tree_node_t* tree_node);
WEPOLL_INTERNAL ts_tree_node_t* port_state_to_handle_tree_node(
    port_state_t* port_state);
/* The reflock is a special kind of lock that normally prevents a chunk of
 * memory from being freed, but does allow the chunk of memory to eventually be
 * released in a coordinated fashion.
 *
 * Under normal operation, threads increase and decrease the reference count;
 * both of these are wait-free operations.
 *
 * Exactly once during the reflock's lifecycle, a thread holding a reference to
 * the lock may "destroy" the lock; this operation blocks until all other
 * threads holding a reference to the lock have dereferenced it. After
 * "destroy" returns, the calling thread may assume that no other threads have
 * a reference to the lock.
 *
 * Attempting to lock or destroy a lock after reflock_unref_and_destroy() has
 * been called is invalid and results in undefined behavior. Therefore the user
 * should use another lock to guarantee that this can't happen.
 */
typedef struct reflock {
  volatile long state; /* 32-bit Interlocked APIs operate on `long` values. */
} reflock_t;

WEPOLL_INTERNAL int reflock_global_init(void);

WEPOLL_INTERNAL void reflock_init(reflock_t* reflock);
WEPOLL_INTERNAL void reflock_ref(reflock_t* reflock);
WEPOLL_INTERNAL void reflock_unref(reflock_t* reflock);
WEPOLL_INTERNAL void reflock_unref_and_destroy(reflock_t* reflock);
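/* Illustrative usage sketch (hypothetical `thing_t`, not code used by wepoll
 * itself): a heap object guarded by a reflock.
 *
 *   typedef struct thing {
 *     reflock_t reflock;
 *   } thing_t;
 *
 *   Threads that want to access the object take a reference first:
 *     reflock_ref(&thing->reflock);
 *     ... use *thing; it cannot be freed while referenced ...
 *     reflock_unref(&thing->reflock);
 *
 *   Exactly one thread tears the object down:
 *     reflock_unref_and_destroy(&thing->reflock);
 *     free(thing);   (safe: destroy blocks until all references are gone)
 */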
/* N.b.: the tree functions do not set errno or LastError when they fail. Each
 * of the API functions has at most one failure mode. It is up to the caller to
 * set an appropriate error code when necessary. */

typedef struct tree tree_t;
typedef struct tree_node tree_node_t;

typedef struct tree {
  tree_node_t* root;
} tree_t;

typedef struct tree_node {
  tree_node_t* parent;
  tree_node_t* left;
  tree_node_t* right;
  uintptr_t key;
  bool red;
} tree_node_t;

WEPOLL_INTERNAL void tree_init(tree_t* tree);
WEPOLL_INTERNAL void tree_node_init(tree_node_t* node);

WEPOLL_INTERNAL int tree_add(tree_t* tree, tree_node_t* node, uintptr_t key);
WEPOLL_INTERNAL void tree_del(tree_t* tree, tree_node_t* node);

WEPOLL_INTERNAL tree_node_t* tree_find(const tree_t* tree, uintptr_t key);
WEPOLL_INTERNAL tree_node_t* tree_root(const tree_t* tree);
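/* Illustrative sketch (hypothetical `item_t`, not part of wepoll): the tree
 * is intrusive, so callers embed a tree_node_t in their own struct and map a
 * found node back to it with container_of(), exactly as
 * sock_state_from_tree_node() does later in this file.
 *
 *   typedef struct item {
 *     tree_node_t tree_node;
 *   } item_t;
 *
 *   tree_t tree;
 *   item_t item;
 *   tree_init(&tree);
 *   tree_node_init(&item.tree_node);
 *   tree_add(&tree, &item.tree_node, (uintptr_t) 42);   (fails on dup key)
 *
 *   tree_node_t* n = tree_find(&tree, (uintptr_t) 42);
 *   item_t* found = n ? container_of(n, item_t, tree_node) : NULL;
 */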
typedef struct ts_tree {
  tree_t tree;
  SRWLOCK lock;
} ts_tree_t;

typedef struct ts_tree_node {
  tree_node_t tree_node;
  reflock_t reflock;
} ts_tree_node_t;

WEPOLL_INTERNAL void ts_tree_init(ts_tree_t* ts_tree);
WEPOLL_INTERNAL void ts_tree_node_init(ts_tree_node_t* node);

WEPOLL_INTERNAL int ts_tree_add(ts_tree_t* ts_tree,
                                ts_tree_node_t* node,
                                uintptr_t key);

WEPOLL_INTERNAL ts_tree_node_t* ts_tree_del_and_ref(ts_tree_t* ts_tree,
                                                    uintptr_t key);
WEPOLL_INTERNAL ts_tree_node_t* ts_tree_find_and_ref(ts_tree_t* ts_tree,
                                                     uintptr_t key);

WEPOLL_INTERNAL void ts_tree_node_unref(ts_tree_node_t* node);
WEPOLL_INTERNAL void ts_tree_node_unref_and_destroy(ts_tree_node_t* node);
static ts_tree_t epoll__handle_tree;

int epoll_global_init(void) {
  ts_tree_init(&epoll__handle_tree);
  return 0;
}

static HANDLE epoll__create(void) {
  port_state_t* port_state;
  HANDLE ephnd;
  ts_tree_node_t* tree_node;

  if (init() < 0)
    return NULL;

  port_state = port_new(&ephnd);
  if (port_state == NULL)
    return NULL;

  tree_node = port_state_to_handle_tree_node(port_state);
  if (ts_tree_add(&epoll__handle_tree, tree_node, (uintptr_t) ephnd) < 0) {
    /* This should never happen. */
    port_delete(port_state);
    return_set_error(NULL, ERROR_ALREADY_EXISTS);
  }

  return ephnd;
}

HANDLE epoll_create(int size) {
  if (size <= 0)
    return_set_error(NULL, ERROR_INVALID_PARAMETER);

  return epoll__create();
}

HANDLE epoll_create1(int flags) {
  if (flags != 0)
    return_set_error(NULL, ERROR_INVALID_PARAMETER);

  return epoll__create();
}

int epoll_close(HANDLE ephnd) {
  ts_tree_node_t* tree_node;
  port_state_t* port_state;

  if (init() < 0)
    return -1;

  tree_node = ts_tree_del_and_ref(&epoll__handle_tree, (uintptr_t) ephnd);
  if (tree_node == NULL) {
    err_set_win_error(ERROR_INVALID_PARAMETER);
    goto err;
  }

  port_state = port_state_from_handle_tree_node(tree_node);
  port_close(port_state);

  ts_tree_node_unref_and_destroy(tree_node);

  return port_delete(port_state);

err:
  err_check_handle(ephnd);
  return -1;
}

int epoll_ctl(HANDLE ephnd, int op, SOCKET sock, struct epoll_event* ev) {
  ts_tree_node_t* tree_node;
  port_state_t* port_state;
  int r;

  if (init() < 0)
    return -1;

  tree_node = ts_tree_find_and_ref(&epoll__handle_tree, (uintptr_t) ephnd);
  if (tree_node == NULL) {
    err_set_win_error(ERROR_INVALID_PARAMETER);
    goto err;
  }

  port_state = port_state_from_handle_tree_node(tree_node);
  r = port_ctl(port_state, op, sock, ev);

  ts_tree_node_unref(tree_node);

  if (r < 0)
    goto err;

  return 0;

err:
  /* On Linux, in the case of epoll_ctl(), EBADF takes priority over other
   * errors. Wepoll mimics this behavior. */
  err_check_handle(ephnd);
  err_check_handle((HANDLE) sock);
  return -1;
}

int epoll_wait(HANDLE ephnd,
               struct epoll_event* events,
               int maxevents,
               int timeout) {
  ts_tree_node_t* tree_node;
  port_state_t* port_state;
  int num_events;

  if (maxevents <= 0)
    return_set_error(-1, ERROR_INVALID_PARAMETER);

  if (init() < 0)
    return -1;

  tree_node = ts_tree_find_and_ref(&epoll__handle_tree, (uintptr_t) ephnd);
  if (tree_node == NULL) {
    err_set_win_error(ERROR_INVALID_PARAMETER);
    goto err;
  }

  port_state = port_state_from_handle_tree_node(tree_node);
  num_events = port_wait(port_state, events, maxevents, timeout);

  ts_tree_node_unref(tree_node);

  if (num_events < 0)
    goto err;

  return num_events;

err:
  err_check_handle(ephnd);
  return -1;
}
#define ERR__ERRNO_MAPPINGS(X)               \
  X(ERROR_ACCESS_DENIED, EACCES)             \
  X(ERROR_ALREADY_EXISTS, EEXIST)            \
  X(ERROR_BAD_COMMAND, EACCES)               \
  X(ERROR_BAD_EXE_FORMAT, ENOEXEC)           \
  X(ERROR_BAD_LENGTH, EACCES)                \
  X(ERROR_BAD_NETPATH, ENOENT)               \
  X(ERROR_BAD_NET_NAME, ENOENT)              \
  X(ERROR_BAD_NET_RESP, ENETDOWN)            \
  X(ERROR_BAD_PATHNAME, ENOENT)              \
  X(ERROR_BROKEN_PIPE, EPIPE)                \
  X(ERROR_CANNOT_MAKE, EACCES)               \
  X(ERROR_COMMITMENT_LIMIT, ENOMEM)          \
  X(ERROR_CONNECTION_ABORTED, ECONNABORTED)  \
  X(ERROR_CONNECTION_ACTIVE, EISCONN)        \
  X(ERROR_CONNECTION_REFUSED, ECONNREFUSED)  \
  X(ERROR_CRC, EACCES)                       \
  X(ERROR_DIR_NOT_EMPTY, ENOTEMPTY)          \
  X(ERROR_DISK_FULL, ENOSPC)                 \
  X(ERROR_DUP_NAME, EADDRINUSE)              \
  X(ERROR_FILENAME_EXCED_RANGE, ENOENT)      \
  X(ERROR_FILE_NOT_FOUND, ENOENT)            \
  X(ERROR_GEN_FAILURE, EACCES)               \
  X(ERROR_GRACEFUL_DISCONNECT, EPIPE)        \
  X(ERROR_HOST_DOWN, EHOSTUNREACH)           \
  X(ERROR_HOST_UNREACHABLE, EHOSTUNREACH)    \
  X(ERROR_INSUFFICIENT_BUFFER, EFAULT)       \
  X(ERROR_INVALID_ADDRESS, EADDRNOTAVAIL)    \
  X(ERROR_INVALID_FUNCTION, EINVAL)          \
  X(ERROR_INVALID_HANDLE, EBADF)             \
  X(ERROR_INVALID_NETNAME, EADDRNOTAVAIL)    \
  X(ERROR_INVALID_PARAMETER, EINVAL)         \
  X(ERROR_INVALID_USER_BUFFER, EMSGSIZE)     \
  X(ERROR_IO_PENDING, EINPROGRESS)           \
  X(ERROR_LOCK_VIOLATION, EACCES)            \
  X(ERROR_MORE_DATA, EMSGSIZE)               \
  X(ERROR_NETNAME_DELETED, ECONNABORTED)     \
  X(ERROR_NETWORK_ACCESS_DENIED, EACCES)     \
  X(ERROR_NETWORK_BUSY, ENETDOWN)            \
  X(ERROR_NETWORK_UNREACHABLE, ENETUNREACH)  \
  X(ERROR_NOACCESS, EFAULT)                  \
  X(ERROR_NONPAGED_SYSTEM_RESOURCES, ENOMEM) \
  X(ERROR_NOT_ENOUGH_MEMORY, ENOMEM)         \
  X(ERROR_NOT_ENOUGH_QUOTA, ENOMEM)          \
  X(ERROR_NOT_FOUND, ENOENT)                 \
  X(ERROR_NOT_LOCKED, EACCES)                \
  X(ERROR_NOT_READY, EACCES)                 \
  X(ERROR_NOT_SAME_DEVICE, EXDEV)            \
  X(ERROR_NOT_SUPPORTED, ENOTSUP)            \
  X(ERROR_NO_MORE_FILES, ENOENT)             \
  X(ERROR_NO_SYSTEM_RESOURCES, ENOMEM)       \
  X(ERROR_OPERATION_ABORTED, EINTR)          \
  X(ERROR_OUT_OF_PAPER, EACCES)              \
  X(ERROR_PAGED_SYSTEM_RESOURCES, ENOMEM)    \
  X(ERROR_PAGEFILE_QUOTA, ENOMEM)            \
  X(ERROR_PATH_NOT_FOUND, ENOENT)            \
  X(ERROR_PIPE_NOT_CONNECTED, EPIPE)         \
  X(ERROR_PORT_UNREACHABLE, ECONNRESET)      \
  X(ERROR_PROTOCOL_UNREACHABLE, ENETUNREACH) \
  X(ERROR_REM_NOT_LIST, ECONNREFUSED)        \
  X(ERROR_REQUEST_ABORTED, EINTR)            \
  X(ERROR_REQ_NOT_ACCEP, EWOULDBLOCK)        \
  X(ERROR_SECTOR_NOT_FOUND, EACCES)          \
  X(ERROR_SEM_TIMEOUT, ETIMEDOUT)            \
  X(ERROR_SHARING_VIOLATION, EACCES)         \
  X(ERROR_TOO_MANY_NAMES, ENOMEM)            \
  X(ERROR_TOO_MANY_OPEN_FILES, EMFILE)       \
  X(ERROR_UNEXP_NET_ERR, ECONNABORTED)       \
  X(ERROR_WAIT_NO_CHILDREN, ECHILD)          \
  X(ERROR_WORKING_SET_QUOTA, ENOMEM)         \
  X(ERROR_WRITE_PROTECT, EACCES)             \
  X(ERROR_WRONG_DISK, EACCES)                \
  X(WSAEACCES, EACCES)                       \
  X(WSAEADDRINUSE, EADDRINUSE)               \
  X(WSAEADDRNOTAVAIL, EADDRNOTAVAIL)         \
  X(WSAEAFNOSUPPORT, EAFNOSUPPORT)           \
  X(WSAECONNABORTED, ECONNABORTED)           \
  X(WSAECONNREFUSED, ECONNREFUSED)           \
  X(WSAECONNRESET, ECONNRESET)               \
  X(WSAEDISCON, EPIPE)                       \
  X(WSAEFAULT, EFAULT)                       \
  X(WSAEHOSTDOWN, EHOSTUNREACH)              \
  X(WSAEHOSTUNREACH, EHOSTUNREACH)           \
  X(WSAEINPROGRESS, EBUSY)                   \
  X(WSAEINTR, EINTR)                         \
  X(WSAEINVAL, EINVAL)                       \
  X(WSAEISCONN, EISCONN)                     \
  X(WSAEMSGSIZE, EMSGSIZE)                   \
  X(WSAENETDOWN, ENETDOWN)                   \
  X(WSAENETRESET, EHOSTUNREACH)              \
  X(WSAENETUNREACH, ENETUNREACH)             \
  X(WSAENOBUFS, ENOMEM)                      \
  X(WSAENOTCONN, ENOTCONN)                   \
  X(WSAENOTSOCK, ENOTSOCK)                   \
  X(WSAEOPNOTSUPP, EOPNOTSUPP)               \
  X(WSAEPROCLIM, ENOMEM)                     \
  X(WSAESHUTDOWN, EPIPE)                     \
  X(WSAETIMEDOUT, ETIMEDOUT)                 \
  X(WSAEWOULDBLOCK, EWOULDBLOCK)             \
  X(WSANOTINITIALISED, ENETDOWN)             \
  X(WSASYSNOTREADY, ENETDOWN)                \
  X(WSAVERNOTSUPPORTED, ENOSYS)
static errno_t err__map_win_error_to_errno(DWORD error) {
  switch (error) {
#define X(error_sym, errno_sym) \
  case error_sym:               \
    return errno_sym;
    ERR__ERRNO_MAPPINGS(X)
#undef X
  }
  return EINVAL;
}

void err_map_win_error(void) {
  errno = err__map_win_error_to_errno(GetLastError());
}

void err_set_win_error(DWORD error) {
  SetLastError(error);
  errno = err__map_win_error_to_errno(error);
}

int err_check_handle(HANDLE handle) {
  DWORD flags;

  /* GetHandleInformation() succeeds when passed INVALID_HANDLE_VALUE, so check
   * for this condition explicitly. */
  if (handle == INVALID_HANDLE_VALUE)
    return_set_error(-1, ERROR_INVALID_HANDLE);

  if (!GetHandleInformation(handle, &flags))
    return_map_error(-1);

  return 0;
}
#define array_count(a) (sizeof(a) / (sizeof((a)[0])))

#define container_of(ptr, type, member) \
  ((type*) ((uintptr_t) (ptr) - offsetof(type, member)))
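/* Example (illustrative, mirrors how this file uses the macro): given a
 * pointer to a struct member, container_of() recovers the enclosing struct.
 * If `q` points at the `queue_node` member of a poll_group_t, then
 *
 *   poll_group_t* pg = container_of(q, poll_group_t, queue_node);
 *
 * yields the poll_group_t itself; poll_group_from_queue_node() below is
 * exactly this expression. */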
#define unused_var(v) ((void) (v))

/* Polyfill `inline` for older versions of msvc (up to Visual Studio 2013) */
#if defined(_MSC_VER) && _MSC_VER < 1900
#define inline __inline
#endif
WEPOLL_INTERNAL int ws_global_init(void);
WEPOLL_INTERNAL SOCKET ws_get_base_socket(SOCKET socket);

static bool init__done = false;
static INIT_ONCE init__once = INIT_ONCE_STATIC_INIT;

static BOOL CALLBACK init__once_callback(INIT_ONCE* once,
                                         void* parameter,
                                         void** context) {
  unused_var(once);
  unused_var(parameter);
  unused_var(context);

  /* N.b. that initialization order matters here. */
  if (ws_global_init() < 0 || nt_global_init() < 0 ||
      reflock_global_init() < 0 || epoll_global_init() < 0)
    return FALSE;

  init__done = true;
  return TRUE;
}

int init(void) {
  if (!init__done &&
      !InitOnceExecuteOnce(&init__once, init__once_callback, NULL, NULL))
    /* `InitOnceExecuteOnce()` itself is infallible, and it doesn't set any
     * error code when the once-callback returns FALSE. We return -1 here to
     * indicate that global initialization failed; the failing init function is
     * responsible for setting `errno` and calling `SetLastError()`. */
    return -1;

  return 0;
}

/* Set up a workaround for the following problem:
 *   FARPROC addr = GetProcAddress(...);
 *   MY_FUNC func = (MY_FUNC) addr;          <-- GCC 8 warning/error.
 *   MY_FUNC func = (MY_FUNC) (void*) addr;  <-- MSVC warning/error.
 * To compile cleanly with either compiler, do casts with this "bridge" type:
 *   MY_FUNC func = (MY_FUNC) (nt__fn_ptr_cast_t) addr; */
#ifdef __GNUC__
typedef void* nt__fn_ptr_cast_t;
#else
typedef FARPROC nt__fn_ptr_cast_t;
#endif

#define X(return_type, attributes, name, parameters) \
  WEPOLL_INTERNAL return_type(attributes* name) parameters = NULL;
NT_NTDLL_IMPORT_LIST(X)
#undef X

int nt_global_init(void) {
  HMODULE ntdll;
  FARPROC fn_ptr;

  ntdll = GetModuleHandleW(L"ntdll.dll");
  if (ntdll == NULL)
    return -1;

#define X(return_type, attributes, name, parameters)                       \
  fn_ptr = GetProcAddress(ntdll, #name);                                   \
  if (fn_ptr == NULL)                                                      \
    return -1;                                                             \
  name = (return_type(attributes*) parameters)(nt__fn_ptr_cast_t) fn_ptr;
  NT_NTDLL_IMPORT_LIST(X)
#undef X

  return 0;
}
typedef struct poll_group poll_group_t;

typedef struct queue_node queue_node_t;

WEPOLL_INTERNAL poll_group_t* poll_group_acquire(port_state_t* port);
WEPOLL_INTERNAL void poll_group_release(poll_group_t* poll_group);

WEPOLL_INTERNAL void poll_group_delete(poll_group_t* poll_group);

WEPOLL_INTERNAL poll_group_t* poll_group_from_queue_node(
    queue_node_t* queue_node);
WEPOLL_INTERNAL HANDLE
poll_group_get_afd_helper_handle(poll_group_t* poll_group);
typedef struct queue_node {
  queue_node_t* prev;
  queue_node_t* next;
} queue_node_t;

typedef struct queue {
  queue_node_t head;
} queue_t;

WEPOLL_INTERNAL void queue_init(queue_t* queue);
WEPOLL_INTERNAL void queue_node_init(queue_node_t* node);

WEPOLL_INTERNAL queue_node_t* queue_first(const queue_t* queue);
WEPOLL_INTERNAL queue_node_t* queue_last(const queue_t* queue);

WEPOLL_INTERNAL void queue_prepend(queue_t* queue, queue_node_t* node);
WEPOLL_INTERNAL void queue_append(queue_t* queue, queue_node_t* node);
WEPOLL_INTERNAL void queue_move_first(queue_t* queue, queue_node_t* node);
WEPOLL_INTERNAL void queue_move_last(queue_t* queue, queue_node_t* node);
WEPOLL_INTERNAL void queue_remove(queue_node_t* node);

WEPOLL_INTERNAL bool queue_empty(const queue_t* queue);
WEPOLL_INTERNAL bool queue_enqueued(const queue_node_t* node);
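/* Illustrative sketch (hypothetical `job_t`, not part of wepoll): the queue
 * is an intrusive circular doubly-linked list. A drain loop looks like the
 * one in port__update_events() below:
 *
 *   typedef struct job {
 *     queue_node_t queue_node;
 *   } job_t;
 *
 *   queue_t q;
 *   job_t job;
 *   queue_init(&q);
 *   queue_node_init(&job.queue_node);
 *   queue_append(&q, &job.queue_node);
 *
 *   while (!queue_empty(&q)) {
 *     job_t* j = container_of(queue_first(&q), job_t, queue_node);
 *     queue_remove(&j->queue_node);
 *     ... process *j ...
 *   }
 */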
static const size_t POLL_GROUP__MAX_GROUP_SIZE = 32;

typedef struct poll_group {
  port_state_t* port_state;
  queue_node_t queue_node;
  HANDLE afd_helper_handle;
  size_t group_size;
} poll_group_t;

static poll_group_t* poll_group__new(port_state_t* port_state) {
  HANDLE iocp_handle = port_get_iocp_handle(port_state);
  queue_t* poll_group_queue = port_get_poll_group_queue(port_state);

  poll_group_t* poll_group = malloc(sizeof *poll_group);
  if (poll_group == NULL)
    return_set_error(NULL, ERROR_NOT_ENOUGH_MEMORY);

  memset(poll_group, 0, sizeof *poll_group);

  queue_node_init(&poll_group->queue_node);
  poll_group->port_state = port_state;

  if (afd_create_helper_handle(iocp_handle, &poll_group->afd_helper_handle) <
      0) {
    free(poll_group);
    return NULL;
  }

  queue_append(poll_group_queue, &poll_group->queue_node);

  return poll_group;
}

void poll_group_delete(poll_group_t* poll_group) {
  assert(poll_group->group_size == 0);
  CloseHandle(poll_group->afd_helper_handle);
  queue_remove(&poll_group->queue_node);
  free(poll_group);
}

poll_group_t* poll_group_from_queue_node(queue_node_t* queue_node) {
  return container_of(queue_node, poll_group_t, queue_node);
}

HANDLE poll_group_get_afd_helper_handle(poll_group_t* poll_group) {
  return poll_group->afd_helper_handle;
}

poll_group_t* poll_group_acquire(port_state_t* port_state) {
  queue_t* poll_group_queue = port_get_poll_group_queue(port_state);
  poll_group_t* poll_group =
      !queue_empty(poll_group_queue)
          ? container_of(
                queue_last(poll_group_queue), poll_group_t, queue_node)
          : NULL;

  if (poll_group == NULL ||
      poll_group->group_size >= POLL_GROUP__MAX_GROUP_SIZE)
    poll_group = poll_group__new(port_state);
  if (poll_group == NULL)
    return NULL;

  if (++poll_group->group_size == POLL_GROUP__MAX_GROUP_SIZE)
    queue_move_first(poll_group_queue, &poll_group->queue_node);

  return poll_group;
}

void poll_group_release(poll_group_t* poll_group) {
  port_state_t* port_state = poll_group->port_state;
  queue_t* poll_group_queue = port_get_poll_group_queue(port_state);

  poll_group->group_size--;
  assert(poll_group->group_size < POLL_GROUP__MAX_GROUP_SIZE);

  queue_move_last(poll_group_queue, &poll_group->queue_node);

  /* Poll groups are currently only freed when the epoll port is closed. */
}
WEPOLL_INTERNAL sock_state_t* sock_new(port_state_t* port_state,
                                       SOCKET socket);
WEPOLL_INTERNAL void sock_delete(port_state_t* port_state,
                                 sock_state_t* sock_state);
WEPOLL_INTERNAL void sock_force_delete(port_state_t* port_state,
                                       sock_state_t* sock_state);

WEPOLL_INTERNAL int sock_set_event(port_state_t* port_state,
                                   sock_state_t* sock_state,
                                   const struct epoll_event* ev);

WEPOLL_INTERNAL int sock_update(port_state_t* port_state,
                                sock_state_t* sock_state);
WEPOLL_INTERNAL int sock_feed_event(port_state_t* port_state,
                                    IO_STATUS_BLOCK* io_status_block,
                                    struct epoll_event* ev);

WEPOLL_INTERNAL sock_state_t* sock_state_from_queue_node(
    queue_node_t* queue_node);
WEPOLL_INTERNAL queue_node_t* sock_state_to_queue_node(
    sock_state_t* sock_state);
WEPOLL_INTERNAL sock_state_t* sock_state_from_tree_node(
    tree_node_t* tree_node);
WEPOLL_INTERNAL tree_node_t* sock_state_to_tree_node(sock_state_t* sock_state);
#define PORT__MAX_ON_STACK_COMPLETIONS 256

typedef struct port_state {
  HANDLE iocp_handle;
  tree_t sock_tree;
  queue_t sock_update_queue;
  queue_t sock_deleted_queue;
  queue_t poll_group_queue;
  ts_tree_node_t handle_tree_node;
  CRITICAL_SECTION lock;
  size_t active_poll_count;
} port_state_t;

static port_state_t* port__alloc(void) {
  port_state_t* port_state = malloc(sizeof *port_state);
  if (port_state == NULL)
    return_set_error(NULL, ERROR_NOT_ENOUGH_MEMORY);

  return port_state;
}

static void port__free(port_state_t* port) {
  assert(port != NULL);
  free(port);
}

static HANDLE port__create_iocp(void) {
  HANDLE iocp_handle =
      CreateIoCompletionPort(INVALID_HANDLE_VALUE, NULL, 0, 0);
  if (iocp_handle == NULL)
    return_map_error(NULL);

  return iocp_handle;
}

port_state_t* port_new(HANDLE* iocp_handle_out) {
  port_state_t* port_state;
  HANDLE iocp_handle;

  port_state = port__alloc();
  if (port_state == NULL)
    goto err1;

  iocp_handle = port__create_iocp();
  if (iocp_handle == NULL)
    goto err2;

  memset(port_state, 0, sizeof *port_state);

  port_state->iocp_handle = iocp_handle;
  tree_init(&port_state->sock_tree);
  queue_init(&port_state->sock_update_queue);
  queue_init(&port_state->sock_deleted_queue);
  queue_init(&port_state->poll_group_queue);
  ts_tree_node_init(&port_state->handle_tree_node);
  InitializeCriticalSection(&port_state->lock);

  *iocp_handle_out = iocp_handle;
  return port_state;

err2:
  port__free(port_state);
err1:
  return NULL;
}

static int port__close_iocp(port_state_t* port_state) {
  HANDLE iocp_handle = port_state->iocp_handle;
  port_state->iocp_handle = NULL;

  if (!CloseHandle(iocp_handle))
    return_map_error(-1);

  return 0;
}

int port_close(port_state_t* port_state) {
  int result;

  EnterCriticalSection(&port_state->lock);
  result = port__close_iocp(port_state);
  LeaveCriticalSection(&port_state->lock);

  return result;
}

int port_delete(port_state_t* port_state) {
  tree_node_t* tree_node;
  queue_node_t* queue_node;

  /* At this point the IOCP port should have been closed. */
  assert(port_state->iocp_handle == NULL);

  while ((tree_node = tree_root(&port_state->sock_tree)) != NULL) {
    sock_state_t* sock_state = sock_state_from_tree_node(tree_node);
    sock_force_delete(port_state, sock_state);
  }

  while ((queue_node = queue_first(&port_state->sock_deleted_queue)) != NULL) {
    sock_state_t* sock_state = sock_state_from_queue_node(queue_node);
    sock_force_delete(port_state, sock_state);
  }

  while ((queue_node = queue_first(&port_state->poll_group_queue)) != NULL) {
    poll_group_t* poll_group = poll_group_from_queue_node(queue_node);
    poll_group_delete(poll_group);
  }

  assert(queue_empty(&port_state->sock_update_queue));

  DeleteCriticalSection(&port_state->lock);

  port__free(port_state);

  return 0;
}
static int port__update_events(port_state_t* port_state) {
  queue_t* sock_update_queue = &port_state->sock_update_queue;

  /* Walk the queue, submitting new poll requests for every socket that needs
   * it. */
  while (!queue_empty(sock_update_queue)) {
    queue_node_t* queue_node = queue_first(sock_update_queue);
    sock_state_t* sock_state = sock_state_from_queue_node(queue_node);

    if (sock_update(port_state, sock_state) < 0)
      return -1;

    /* sock_update() removes the socket from the update queue. */
  }

  return 0;
}

static void port__update_events_if_polling(port_state_t* port_state) {
  if (port_state->active_poll_count > 0)
    port__update_events(port_state);
}

static int port__feed_events(port_state_t* port_state,
                             struct epoll_event* epoll_events,
                             OVERLAPPED_ENTRY* iocp_events,
                             DWORD iocp_event_count) {
  int epoll_event_count = 0;
  DWORD i;

  for (i = 0; i < iocp_event_count; i++) {
    IO_STATUS_BLOCK* io_status_block =
        (IO_STATUS_BLOCK*) iocp_events[i].lpOverlapped;
    struct epoll_event* ev = &epoll_events[epoll_event_count];

    epoll_event_count += sock_feed_event(port_state, io_status_block, ev);
  }

  return epoll_event_count;
}

static int port__poll(port_state_t* port_state,
                      struct epoll_event* epoll_events,
                      OVERLAPPED_ENTRY* iocp_events,
                      DWORD maxevents,
                      DWORD timeout) {
  DWORD completion_count;

  if (port__update_events(port_state) < 0)
    return -1;

  port_state->active_poll_count++;

  LeaveCriticalSection(&port_state->lock);

  BOOL r = GetQueuedCompletionStatusEx(port_state->iocp_handle,
                                       iocp_events,
                                       maxevents,
                                       &completion_count,
                                       timeout,
                                       FALSE);

  EnterCriticalSection(&port_state->lock);

  port_state->active_poll_count--;

  if (!r)
    return_map_error(-1);

  return port__feed_events(
      port_state, epoll_events, iocp_events, completion_count);
}
int port_wait(port_state_t* port_state,
              struct epoll_event* events,
              int maxevents,
              int timeout) {
  OVERLAPPED_ENTRY stack_iocp_events[PORT__MAX_ON_STACK_COMPLETIONS];
  OVERLAPPED_ENTRY* iocp_events;
  uint64_t due = 0;
  DWORD gqcs_timeout;
  int result;

  /* Check whether `maxevents` is in range. */
  if (maxevents <= 0)
    return_set_error(-1, ERROR_INVALID_PARAMETER);

  /* Decide whether the IOCP completion list can live on the stack, or allocate
   * memory for it on the heap. */
  if ((size_t) maxevents <= array_count(stack_iocp_events)) {
    iocp_events = stack_iocp_events;
  } else if ((iocp_events =
                  malloc((size_t) maxevents * sizeof *iocp_events)) == NULL) {
    iocp_events = stack_iocp_events;
    maxevents = array_count(stack_iocp_events);
  }

  /* Compute the timeout for GetQueuedCompletionStatus, and the wait end
   * time, if the user specified a timeout other than zero or infinite. */
  if (timeout > 0) {
    due = GetTickCount64() + (uint64_t) timeout;
    gqcs_timeout = (DWORD) timeout;
  } else if (timeout == 0) {
    gqcs_timeout = 0;
  } else {
    gqcs_timeout = INFINITE;
  }

  EnterCriticalSection(&port_state->lock);

  /* Dequeue completion packets until either at least one interesting event
   * has been discovered, or the timeout is reached. */
  for (;;) {
    uint64_t now;

    result = port__poll(
        port_state, events, iocp_events, (DWORD) maxevents, gqcs_timeout);
    if (result < 0 || result > 0)
      break; /* Result, error, or time-out. */

    if (timeout < 0)
      continue; /* When timeout is negative, never time out. */

    /* Update time. */
    now = GetTickCount64();

    /* Do not allow the due time to be in the past. */
    if (now >= due) {
      SetLastError(WAIT_TIMEOUT);
      break;
    }

    /* Recompute time-out argument for GetQueuedCompletionStatus. */
    gqcs_timeout = (DWORD)(due - now);
  }

  port__update_events_if_polling(port_state);

  LeaveCriticalSection(&port_state->lock);

  if (iocp_events != stack_iocp_events)
    free(iocp_events);

  if (result >= 0)
    return result;
  else if (GetLastError() == WAIT_TIMEOUT)
    return 0;
  else
    return -1;
}
static int port__ctl_add(port_state_t* port_state,
                         SOCKET sock,
                         struct epoll_event* ev) {
  sock_state_t* sock_state = sock_new(port_state, sock);
  if (sock_state == NULL)
    return -1;

  if (sock_set_event(port_state, sock_state, ev) < 0) {
    sock_delete(port_state, sock_state);
    return -1;
  }

  port__update_events_if_polling(port_state);

  return 0;
}

static int port__ctl_mod(port_state_t* port_state,
                         SOCKET sock,
                         struct epoll_event* ev) {
  sock_state_t* sock_state = port_find_socket(port_state, sock);
  if (sock_state == NULL)
    return -1;

  if (sock_set_event(port_state, sock_state, ev) < 0)
    return -1;

  port__update_events_if_polling(port_state);

  return 0;
}

static int port__ctl_del(port_state_t* port_state, SOCKET sock) {
  sock_state_t* sock_state = port_find_socket(port_state, sock);
  if (sock_state == NULL)
    return -1;

  sock_delete(port_state, sock_state);

  return 0;
}

static int port__ctl_op(port_state_t* port_state,
                        int op,
                        SOCKET sock,
                        struct epoll_event* ev) {
  switch (op) {
    case EPOLL_CTL_ADD:
      return port__ctl_add(port_state, sock, ev);
    case EPOLL_CTL_MOD:
      return port__ctl_mod(port_state, sock, ev);
    case EPOLL_CTL_DEL:
      return port__ctl_del(port_state, sock);
    default:
      return_set_error(-1, ERROR_INVALID_PARAMETER);
  }
}

int port_ctl(port_state_t* port_state,
             int op,
             SOCKET sock,
             struct epoll_event* ev) {
  int result;

  EnterCriticalSection(&port_state->lock);
  result = port__ctl_op(port_state, op, sock, ev);
  LeaveCriticalSection(&port_state->lock);

  return result;
}

int port_register_socket_handle(port_state_t* port_state,
                                sock_state_t* sock_state,
                                SOCKET socket) {
  if (tree_add(&port_state->sock_tree,
               sock_state_to_tree_node(sock_state),
               socket) < 0)
    return_set_error(-1, ERROR_ALREADY_EXISTS);
  return 0;
}

void port_unregister_socket_handle(port_state_t* port_state,
                                   sock_state_t* sock_state) {
  tree_del(&port_state->sock_tree, sock_state_to_tree_node(sock_state));
}

sock_state_t* port_find_socket(port_state_t* port_state, SOCKET socket) {
  tree_node_t* tree_node = tree_find(&port_state->sock_tree, socket);
  if (tree_node == NULL)
    return_set_error(NULL, ERROR_NOT_FOUND);
  return sock_state_from_tree_node(tree_node);
}

void port_request_socket_update(port_state_t* port_state,
                                sock_state_t* sock_state) {
  if (queue_enqueued(sock_state_to_queue_node(sock_state)))
    return;
  queue_append(&port_state->sock_update_queue,
               sock_state_to_queue_node(sock_state));
}

void port_cancel_socket_update(port_state_t* port_state,
                               sock_state_t* sock_state) {
  unused_var(port_state);
  if (!queue_enqueued(sock_state_to_queue_node(sock_state)))
    return;
  queue_remove(sock_state_to_queue_node(sock_state));
}

void port_add_deleted_socket(port_state_t* port_state,
                             sock_state_t* sock_state) {
  if (queue_enqueued(sock_state_to_queue_node(sock_state)))
    return;
  queue_append(&port_state->sock_deleted_queue,
               sock_state_to_queue_node(sock_state));
}

void port_remove_deleted_socket(port_state_t* port_state,
                                sock_state_t* sock_state) {
  unused_var(port_state);
  if (!queue_enqueued(sock_state_to_queue_node(sock_state)))
    return;
  queue_remove(sock_state_to_queue_node(sock_state));
}

HANDLE port_get_iocp_handle(port_state_t* port_state) {
  assert(port_state->iocp_handle != NULL);
  return port_state->iocp_handle;
}

queue_t* port_get_poll_group_queue(port_state_t* port_state) {
  return &port_state->poll_group_queue;
}

port_state_t* port_state_from_handle_tree_node(ts_tree_node_t* tree_node) {
  return container_of(tree_node, port_state_t, handle_tree_node);
}

ts_tree_node_t* port_state_to_handle_tree_node(port_state_t* port_state) {
  return &port_state->handle_tree_node;
}
void queue_init(queue_t* queue) {
  queue_node_init(&queue->head);
}

void queue_node_init(queue_node_t* node) {
  node->prev = node;
  node->next = node;
}

static inline void queue__detach_node(queue_node_t* node) {
  node->prev->next = node->next;
  node->next->prev = node->prev;
}

queue_node_t* queue_first(const queue_t* queue) {
  return !queue_empty(queue) ? queue->head.next : NULL;
}

queue_node_t* queue_last(const queue_t* queue) {
  return !queue_empty(queue) ? queue->head.prev : NULL;
}

void queue_prepend(queue_t* queue, queue_node_t* node) {
  node->next = queue->head.next;
  node->prev = &queue->head;
  node->next->prev = node;
  queue->head.next = node;
}

void queue_append(queue_t* queue, queue_node_t* node) {
  node->next = &queue->head;
  node->prev = queue->head.prev;
  node->prev->next = node;
  queue->head.prev = node;
}

void queue_move_first(queue_t* queue, queue_node_t* node) {
  queue__detach_node(node);
  queue_prepend(queue, node);
}

void queue_move_last(queue_t* queue, queue_node_t* node) {
  queue__detach_node(node);
  queue_append(queue, node);
}

void queue_remove(queue_node_t* node) {
  queue__detach_node(node);
  queue_node_init(node);
}

bool queue_empty(const queue_t* queue) {
  return !queue_enqueued(&queue->head);
}

bool queue_enqueued(const queue_node_t* node) {
  return node->prev != node;
}
static const long REFLOCK__REF          = (long) 0x00000001;
static const long REFLOCK__REF_MASK     = (long) 0x0fffffff;
static const long REFLOCK__DESTROY      = (long) 0x10000000;
static const long REFLOCK__DESTROY_MASK = (long) 0xf0000000;
static const long REFLOCK__POISON       = (long) 0x300dead0;
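/* How the constants above pack into `state` (a reading implied by their use
 * below): bits 0-27 hold the reference count in units of REFLOCK__REF, and
 * bits 28-31 hold the destroy flag. For example, a lock with 3 references has
 * state 0x00000003; reflock_unref_and_destroy() then adds
 * REFLOCK__DESTROY - REFLOCK__REF, giving 0x10000002, i.e. destruction
 * pending with 2 references still outstanding. */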
static HANDLE reflock__keyed_event = NULL;

int reflock_global_init(void) {
  NTSTATUS status = NtCreateKeyedEvent(
      &reflock__keyed_event, KEYEDEVENT_ALL_ACCESS, NULL, 0);
  if (status != STATUS_SUCCESS)
    return_set_error(-1, RtlNtStatusToDosError(status));
  return 0;
}

void reflock_init(reflock_t* reflock) {
  reflock->state = 0;
}

static void reflock__signal_event(void* address) {
  NTSTATUS status =
      NtReleaseKeyedEvent(reflock__keyed_event, address, FALSE, NULL);
  if (status != STATUS_SUCCESS)
    abort();
}

static void reflock__await_event(void* address) {
  NTSTATUS status =
      NtWaitForKeyedEvent(reflock__keyed_event, address, FALSE, NULL);
  if (status != STATUS_SUCCESS)
    abort();
}

void reflock_ref(reflock_t* reflock) {
  long state = InterlockedAdd(&reflock->state, REFLOCK__REF);

  /* Verify that the counter didn't overflow and the lock isn't destroyed. */
  assert((state & REFLOCK__DESTROY_MASK) == 0);
  unused_var(state);
}

void reflock_unref(reflock_t* reflock) {
  long state = InterlockedAdd(&reflock->state, -REFLOCK__REF);

  /* Verify that the lock was referenced and not already destroyed. */
  assert((state & REFLOCK__DESTROY_MASK & ~REFLOCK__DESTROY) == 0);

  if (state == REFLOCK__DESTROY)
    reflock__signal_event(reflock);
}

void reflock_unref_and_destroy(reflock_t* reflock) {
  long state =
      InterlockedAdd(&reflock->state, REFLOCK__DESTROY - REFLOCK__REF);
  long ref_count = state & REFLOCK__REF_MASK;

  /* Verify that the lock was referenced and not already destroyed. */
  assert((state & REFLOCK__DESTROY_MASK) == REFLOCK__DESTROY);

  if (ref_count != 0)
    reflock__await_event(reflock);

  state = InterlockedExchange(&reflock->state, REFLOCK__POISON);
  assert(state == REFLOCK__DESTROY);
}
static const uint32_t SOCK__KNOWN_EPOLL_EVENTS =
    EPOLLIN | EPOLLPRI | EPOLLOUT | EPOLLERR | EPOLLHUP | EPOLLRDNORM |
    EPOLLRDBAND | EPOLLWRNORM | EPOLLWRBAND | EPOLLMSG | EPOLLRDHUP;

typedef enum sock__poll_status {
  SOCK__POLL_IDLE = 0,
  SOCK__POLL_PENDING,
  SOCK__POLL_CANCELLED
} sock__poll_status_t;

typedef struct sock_state {
  IO_STATUS_BLOCK io_status_block;
  AFD_POLL_INFO poll_info;
  queue_node_t queue_node;
  tree_node_t tree_node;
  poll_group_t* poll_group;
  SOCKET base_socket;
  epoll_data_t user_data;
  uint32_t user_events;
  uint32_t pending_events;
  sock__poll_status_t poll_status;
  bool delete_pending;
} sock_state_t;
static inline sock_state_t* sock__alloc(void) {
  sock_state_t* sock_state = malloc(sizeof *sock_state);
  if (sock_state == NULL)
    return_set_error(NULL, ERROR_NOT_ENOUGH_MEMORY);
  return sock_state;
}

static inline void sock__free(sock_state_t* sock_state) {
  free(sock_state);
}

static int sock__cancel_poll(sock_state_t* sock_state) {
  assert(sock_state->poll_status == SOCK__POLL_PENDING);

  if (afd_cancel_poll(poll_group_get_afd_helper_handle(sock_state->poll_group),
                      &sock_state->io_status_block) < 0)
    return -1;

  sock_state->poll_status = SOCK__POLL_CANCELLED;
  sock_state->pending_events = 0;
  return 0;
}

sock_state_t* sock_new(port_state_t* port_state, SOCKET socket) {
  SOCKET base_socket;
  poll_group_t* poll_group;
  sock_state_t* sock_state;

  if (socket == 0 || socket == INVALID_SOCKET)
    return_set_error(NULL, ERROR_INVALID_HANDLE);

  base_socket = ws_get_base_socket(socket);
  if (base_socket == INVALID_SOCKET)
    return NULL;

  poll_group = poll_group_acquire(port_state);
  if (poll_group == NULL)
    return NULL;

  sock_state = sock__alloc();
  if (sock_state == NULL)
    goto err1;

  memset(sock_state, 0, sizeof *sock_state);

  sock_state->base_socket = base_socket;
  sock_state->poll_group = poll_group;

  tree_node_init(&sock_state->tree_node);
  queue_node_init(&sock_state->queue_node);

  if (port_register_socket_handle(port_state, sock_state, socket) < 0)
    goto err2;

  return sock_state;

err2:
  sock__free(sock_state);
err1:
  poll_group_release(poll_group);

  return NULL;
}

static int sock__delete(port_state_t* port_state,
                        sock_state_t* sock_state,
                        bool force) {
  if (!sock_state->delete_pending) {
    if (sock_state->poll_status == SOCK__POLL_PENDING)
      sock__cancel_poll(sock_state);

    port_cancel_socket_update(port_state, sock_state);
    port_unregister_socket_handle(port_state, sock_state);

    sock_state->delete_pending = true;
  }

  /* If the poll request still needs to complete, the sock_state object can't
   * be free()d yet. `sock_feed_event()` or `port_close()` will take care
   * of this later. */
  if (force || sock_state->poll_status == SOCK__POLL_IDLE) {
    /* Free the sock_state now. */
    port_remove_deleted_socket(port_state, sock_state);
    poll_group_release(sock_state->poll_group);
    sock__free(sock_state);
  } else {
    /* Free the socket later. */
    port_add_deleted_socket(port_state, sock_state);
  }

  return 0;
}

void sock_delete(port_state_t* port_state, sock_state_t* sock_state) {
  sock__delete(port_state, sock_state, false);
}

void sock_force_delete(port_state_t* port_state, sock_state_t* sock_state) {
  sock__delete(port_state, sock_state, true);
}
int sock_set_event(port_state_t* port_state,
                   sock_state_t* sock_state,
                   const struct epoll_event* ev) {
  /* EPOLLERR and EPOLLHUP are always reported, even when not requested by the
   * caller. However they are disabled after an event has been reported for a
   * socket for which the EPOLLONESHOT flag is set. */
  uint32_t events = ev->events | EPOLLERR | EPOLLHUP;

  sock_state->user_events = events;
  sock_state->user_data = ev->data;

  if ((events & SOCK__KNOWN_EPOLL_EVENTS & ~sock_state->pending_events) != 0)
    port_request_socket_update(port_state, sock_state);

  return 0;
}
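/* Usage note (illustrative; matches Linux epoll semantics): after an
 * EPOLLONESHOT event fires, sock_feed_event() clears `user_events`, so the
 * socket stays in the set but reports nothing further until the caller
 * re-arms it through the public API:
 *
 *   ev.events = EPOLLIN | EPOLLONESHOT;
 *   ev.data.sock = sock;
 *   epoll_ctl(ephnd, EPOLL_CTL_MOD, sock, &ev);
 */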
static inline DWORD sock__epoll_events_to_afd_events(uint32_t epoll_events) {
  /* Always monitor for AFD_POLL_LOCAL_CLOSE, which is triggered when the
   * socket is closed with closesocket() or CloseHandle(). */
  DWORD afd_events = AFD_POLL_LOCAL_CLOSE;

  if (epoll_events & (EPOLLIN | EPOLLRDNORM))
    afd_events |= AFD_POLL_RECEIVE | AFD_POLL_ACCEPT;
  if (epoll_events & (EPOLLPRI | EPOLLRDBAND))
    afd_events |= AFD_POLL_RECEIVE_EXPEDITED;
  if (epoll_events & (EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND))
    afd_events |= AFD_POLL_SEND;
  if (epoll_events & (EPOLLIN | EPOLLRDNORM | EPOLLRDHUP))
    afd_events |= AFD_POLL_DISCONNECT;
  if (epoll_events & EPOLLHUP)
    afd_events |= AFD_POLL_ABORT;
  if (epoll_events & EPOLLERR)
    afd_events |= AFD_POLL_CONNECT_FAIL;

  return afd_events;
}

static inline uint32_t sock__afd_events_to_epoll_events(DWORD afd_events) {
  uint32_t epoll_events = 0;

  if (afd_events & (AFD_POLL_RECEIVE | AFD_POLL_ACCEPT))
    epoll_events |= EPOLLIN | EPOLLRDNORM;
  if (afd_events & AFD_POLL_RECEIVE_EXPEDITED)
    epoll_events |= EPOLLPRI | EPOLLRDBAND;
  if (afd_events & AFD_POLL_SEND)
    epoll_events |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
  if (afd_events & AFD_POLL_DISCONNECT)
    epoll_events |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
  if (afd_events & AFD_POLL_ABORT)
    epoll_events |= EPOLLHUP;
  if (afd_events & AFD_POLL_CONNECT_FAIL)
    /* Linux reports all these events after connect() has failed. */
    epoll_events |=
        EPOLLIN | EPOLLOUT | EPOLLERR | EPOLLRDNORM | EPOLLWRNORM | EPOLLRDHUP;

  return epoll_events;
}
int sock_update(port_state_t* port_state, sock_state_t* sock_state) {
  assert(!sock_state->delete_pending);

  if ((sock_state->poll_status == SOCK__POLL_PENDING) &&
      (sock_state->user_events & SOCK__KNOWN_EPOLL_EVENTS &
       ~sock_state->pending_events) == 0) {
    /* All the events the user is interested in are already being monitored by
     * the pending poll operation. It might spuriously complete because of an
     * event that we're no longer interested in; when that happens we'll submit
     * a new poll operation with the updated event mask. */

  } else if (sock_state->poll_status == SOCK__POLL_PENDING) {
    /* A poll operation is already pending, but it's not monitoring for all the
     * events that the user is interested in. Therefore, cancel the pending
     * poll operation; when we receive its completion packet, a new poll
     * operation will be submitted with the correct event mask. */
    if (sock__cancel_poll(sock_state) < 0)
      return -1;

  } else if (sock_state->poll_status == SOCK__POLL_CANCELLED) {
    /* The poll operation has already been cancelled, we're still waiting for
     * it to return. For now, there's nothing that needs to be done. */

  } else if (sock_state->poll_status == SOCK__POLL_IDLE) {
    /* No poll operation is pending; start one. */
    sock_state->poll_info.Exclusive = FALSE;
    sock_state->poll_info.NumberOfHandles = 1;
    sock_state->poll_info.Timeout.QuadPart = INT64_MAX;
    sock_state->poll_info.Handles[0].Handle = (HANDLE) sock_state->base_socket;
    sock_state->poll_info.Handles[0].Status = 0;
    sock_state->poll_info.Handles[0].Events =
        sock__epoll_events_to_afd_events(sock_state->user_events);

    if (afd_poll(poll_group_get_afd_helper_handle(sock_state->poll_group),
                 &sock_state->poll_info,
                 &sock_state->io_status_block) < 0) {
      switch (GetLastError()) {
        case ERROR_IO_PENDING:
          /* Overlapped poll operation in progress; this is expected. */
          break;
        case ERROR_INVALID_HANDLE:
          /* Socket closed; it'll be dropped from the epoll set. */
          return sock__delete(port_state, sock_state, false);
        default:
          /* Other errors are propagated to the caller. */
          return_map_error(-1);
      }
    }

    /* The poll request was successfully submitted. */
    sock_state->poll_status = SOCK__POLL_PENDING;
    sock_state->pending_events = sock_state->user_events;

  } else {
    /* Unreachable. */
    assert(false);
  }

  port_cancel_socket_update(port_state, sock_state);
  return 0;
}
int sock_feed_event(port_state_t* port_state,
                    IO_STATUS_BLOCK* io_status_block,
                    struct epoll_event* ev) {
  sock_state_t* sock_state =
      container_of(io_status_block, sock_state_t, io_status_block);
  AFD_POLL_INFO* poll_info = &sock_state->poll_info;
  uint32_t epoll_events = 0;

  sock_state->poll_status = SOCK__POLL_IDLE;
  sock_state->pending_events = 0;

  if (sock_state->delete_pending) {
    /* Socket has been deleted earlier and can now be freed. */
    return sock__delete(port_state, sock_state, false);

  } else if (io_status_block->Status == STATUS_CANCELLED) {
    /* The poll request was cancelled by CancelIoEx. */

  } else if (!NT_SUCCESS(io_status_block->Status)) {
    /* The overlapped request itself failed in an unexpected way. */
    epoll_events = EPOLLERR;

  } else if (poll_info->NumberOfHandles < 1) {
    /* This poll operation succeeded but didn't report any socket events. */

  } else if (poll_info->Handles[0].Events & AFD_POLL_LOCAL_CLOSE) {
    /* The poll operation reported that the socket was closed. */
    return sock__delete(port_state, sock_state, false);

  } else {
    /* Events related to our socket were reported. */
    epoll_events =
        sock__afd_events_to_epoll_events(poll_info->Handles[0].Events);
  }

  /* Requeue the socket so a new poll request will be submitted. */
  port_request_socket_update(port_state, sock_state);

  /* Filter out events that the user didn't ask for. */
  epoll_events &= sock_state->user_events;

  /* Return if there are no epoll events to report. */
  if (epoll_events == 0)
    return 0;

  /* If the socket has the EPOLLONESHOT flag set, unmonitor all events,
   * even EPOLLERR and EPOLLHUP. But always keep looking for closed sockets. */
  if (sock_state->user_events & EPOLLONESHOT)
    sock_state->user_events = 0;

  ev->data = sock_state->user_data;
  ev->events = epoll_events;
  return 1;
}
sock_state_t* sock_state_from_queue_node(queue_node_t* queue_node) {
  return container_of(queue_node, sock_state_t, queue_node);
}

queue_node_t* sock_state_to_queue_node(sock_state_t* sock_state) {
  return &sock_state->queue_node;
}

sock_state_t* sock_state_from_tree_node(tree_node_t* tree_node) {
  return container_of(tree_node, sock_state_t, tree_node);
}

tree_node_t* sock_state_to_tree_node(sock_state_t* sock_state) {
  return &sock_state->tree_node;
}

void ts_tree_init(ts_tree_t* ts_tree) {
  tree_init(&ts_tree->tree);
  InitializeSRWLock(&ts_tree->lock);
}

void ts_tree_node_init(ts_tree_node_t* node) {
  tree_node_init(&node->tree_node);
  reflock_init(&node->reflock);
}

int ts_tree_add(ts_tree_t* ts_tree, ts_tree_node_t* node, uintptr_t key) {
  int r;

  AcquireSRWLockExclusive(&ts_tree->lock);
  r = tree_add(&ts_tree->tree, &node->tree_node, key);
  ReleaseSRWLockExclusive(&ts_tree->lock);

  return r;
}

static inline ts_tree_node_t* ts_tree__find_node(ts_tree_t* ts_tree,
                                                 uintptr_t key) {
  tree_node_t* tree_node = tree_find(&ts_tree->tree, key);
  if (tree_node == NULL)
    return NULL;

  return container_of(tree_node, ts_tree_node_t, tree_node);
}

ts_tree_node_t* ts_tree_del_and_ref(ts_tree_t* ts_tree, uintptr_t key) {
  ts_tree_node_t* ts_tree_node;

  AcquireSRWLockExclusive(&ts_tree->lock);

  ts_tree_node = ts_tree__find_node(ts_tree, key);
  if (ts_tree_node != NULL) {
    tree_del(&ts_tree->tree, &ts_tree_node->tree_node);
    reflock_ref(&ts_tree_node->reflock);
  }

  ReleaseSRWLockExclusive(&ts_tree->lock);

  return ts_tree_node;
}

ts_tree_node_t* ts_tree_find_and_ref(ts_tree_t* ts_tree, uintptr_t key) {
  ts_tree_node_t* ts_tree_node;

  AcquireSRWLockShared(&ts_tree->lock);

  ts_tree_node = ts_tree__find_node(ts_tree, key);
  if (ts_tree_node != NULL)
    reflock_ref(&ts_tree_node->reflock);

  ReleaseSRWLockShared(&ts_tree->lock);

  return ts_tree_node;
}

void ts_tree_node_unref(ts_tree_node_t* node) {
  reflock_unref(&node->reflock);
}

void ts_tree_node_unref_and_destroy(ts_tree_node_t* node) {
  reflock_unref_and_destroy(&node->reflock);
}
void tree_init(tree_t* tree) {
  memset(tree, 0, sizeof *tree);
}

void tree_node_init(tree_node_t* node) {
  memset(node, 0, sizeof *node);
}

#define TREE__ROTATE(cis, trans)   \
  tree_node_t* p = node;           \
  tree_node_t* q = node->trans;    \
  tree_node_t* parent = p->parent; \
                                   \
  if (parent) {                    \
    if (parent->left == p)         \
      parent->left = q;            \
    else                           \
      parent->right = q;           \
  } else {                         \
    tree->root = q;                \
  }                                \
                                   \
  q->parent = parent;              \
  p->parent = q;                   \
  p->trans = q->cis;               \
  if (p->trans)                    \
    p->trans->parent = p;          \
  q->cis = p;

static inline void tree__rotate_left(tree_t* tree, tree_node_t* node) {
  TREE__ROTATE(left, right)
}

static inline void tree__rotate_right(tree_t* tree, tree_node_t* node) {
  TREE__ROTATE(right, left)
}

#define TREE__INSERT_OR_DESCEND(side) \
  if (parent->side) {                 \
    parent = parent->side;            \
  } else {                            \
    parent->side = node;              \
    break;                            \
  }

#define TREE__REBALANCE_AFTER_INSERT(cis, trans) \
  tree_node_t* grandparent = parent->parent;     \
  tree_node_t* uncle = grandparent->trans;       \
                                                 \
  if (uncle && uncle->red) {                     \
    parent->red = uncle->red = false;            \
    grandparent->red = true;                     \
    node = grandparent;                          \
  } else {                                       \
    if (node == parent->trans) {                 \
      tree__rotate_##cis(tree, parent);          \
      node = parent;                             \
      parent = node->parent;                     \
    }                                            \
    parent->red = false;                         \
    grandparent->red = true;                     \
    tree__rotate_##trans(tree, grandparent);     \
  }

int tree_add(tree_t* tree, tree_node_t* node, uintptr_t key) {
  tree_node_t* parent;

  parent = tree->root;
  if (parent) {
    for (;;) {
      if (key < parent->key) {
        TREE__INSERT_OR_DESCEND(left)
      } else if (key > parent->key) {
        TREE__INSERT_OR_DESCEND(right)
      } else {
        return -1;
      }
    }
  } else {
    tree->root = node;
  }

  node->key = key;
  node->left = node->right = NULL;
  node->parent = parent;
  node->red = true;

  for (; parent && parent->red; parent = node->parent) {
    if (parent == parent->parent->left) {
      TREE__REBALANCE_AFTER_INSERT(left, right)
    } else {
      TREE__REBALANCE_AFTER_INSERT(right, left)
    }
  }
  tree->root->red = false;

  return 0;
}
#define TREE__REBALANCE_AFTER_REMOVE(cis, trans)   \
  tree_node_t* sibling = parent->trans;            \
                                                   \
  if (sibling->red) {                              \
    sibling->red = false;                          \
    parent->red = true;                            \
    tree__rotate_##cis(tree, parent);              \
    sibling = parent->trans;                       \
  }                                                \
  if ((sibling->left && sibling->left->red) ||     \
      (sibling->right && sibling->right->red)) {   \
    if (!sibling->trans || !sibling->trans->red) { \
      sibling->cis->red = false;                   \
      sibling->red = true;                         \
      tree__rotate_##trans(tree, sibling);         \
      sibling = parent->trans;                     \
    }                                              \
    sibling->red = parent->red;                    \
    parent->red = sibling->trans->red = false;     \
    tree__rotate_##cis(tree, parent);              \
    node = tree->root;                             \
    break;                                         \
  }                                                \
  sibling->red = true;

void tree_del(tree_t* tree, tree_node_t* node) {
  tree_node_t* parent = node->parent;
  tree_node_t* left = node->left;
  tree_node_t* right = node->right;
  tree_node_t* next;
  bool red;

  if (!left) {
    next = right;
  } else if (!right) {
    next = left;
  } else {
    next = right;
    while (next->left)
      next = next->left;
  }

  if (parent) {
    if (parent->left == node)
      parent->left = next;
    else
      parent->right = next;
  } else {
    tree->root = next;
  }

  if (left && right) {
    red = next->red;
    next->red = node->red;
    next->left = left;
    left->parent = next;
    if (next != right) {
      parent = next->parent;
      next->parent = node->parent;
      node = next->right;
      parent->left = node;
      next->right = right;
      right->parent = next;
    } else {
      next->parent = parent;
      parent = next;
      node = next->right;
    }
  } else {
    red = node->red;
    node = next;
  }

  if (node)
    node->parent = parent;
  if (red)
    return;
  if (node && node->red) {
    node->red = false;
    return;
  }

  do {
    if (node == tree->root)
      break;
    if (node == parent->left) {
      TREE__REBALANCE_AFTER_REMOVE(left, right)
    } else {
      TREE__REBALANCE_AFTER_REMOVE(right, left)
    }
    node = parent;
    parent = parent->parent;
  } while (!node->red);

  if (node)
    node->red = false;
}
tree_node_t* tree_find(const tree_t* tree, uintptr_t key) {
  tree_node_t* node = tree->root;
  while (node) {
    if (key < node->key)
      node = node->left;
    else if (key > node->key)
      node = node->right;
    else
      return node;
  }
  return NULL;
}

tree_node_t* tree_root(const tree_t* tree) {
  return tree->root;
}
#ifndef SIO_BASE_HANDLE
#define SIO_BASE_HANDLE 0x48000022
#endif

int ws_global_init(void) {
  int r;
  WSADATA wsa_data;

  r = WSAStartup(MAKEWORD(2, 2), &wsa_data);
  if (r != 0)
    return_set_error(-1, (DWORD) r);

  return 0;
}

SOCKET ws_get_base_socket(SOCKET socket) {
  SOCKET base_socket;
  DWORD bytes;

  if (WSAIoctl(socket,
               SIO_BASE_HANDLE,
               NULL,
               0,
               &base_socket,
               sizeof base_socket,
               &bytes,
               NULL,
               NULL) == SOCKET_ERROR)
    return_map_error(INVALID_SOCKET);

  return base_socket;
}