mirror of
git://sourceware.org/git/glibc.git
synced 2024-11-27 03:41:23 +08:00
89b53077d2
The current racy approach is to enable asynchronous cancellation before making the syscall and restore the previous cancellation type once the syscall returns, and check if cancellation has happened during the cancellation entrypoint. As described in BZ#12683, this approach shows 2 problems: 1. Cancellation can act after the syscall has returned from the kernel, but before userspace saves the return value. It might result in a resource leak if the syscall allocated a resource or a side effect (partial read/write), and there is no way for the program to handle it with cancellation handlers. 2. If a signal is handled while the thread is blocked at a cancellable syscall, the entire signal handler runs with asynchronous cancellation enabled. This can lead to issues if the signal handler calls functions which are async-signal-safe but not async-cancel-safe. For the cancellation to work correctly, there are 5 points at which the cancellation signal could arrive: [ ... )[ ... )[ syscall ]( ... 1 2 3 4 5 1. Before initial testcancel, e.g. [*... testcancel) 2. Between testcancel and syscall start, e.g. [testcancel...syscall start) 3. While syscall is blocked and no side effects have yet taken place, e.g. [ syscall ] 4. Same as 3 but with side-effects having occurred (e.g. a partial read or write). 5. After syscall end e.g. (syscall end...*] And libc wants to act on cancellation in cases 1, 2, and 3 but not in cases 4 or 5. For the 4 and 5 cases, the cancellation will eventually happen in the next cancellable entrypoint without any further external event. The proposed solution for each case is: 1. Do a conditional branch based on whether the thread has received a cancellation request; 2. It can be caught by the signal handler determining that the saved program counter (from the ucontext_t) is in some address range beginning just before the "testcancel" and ending with the syscall instruction. 3. 
SIGCANCEL can be caught by the signal handler, which determines that the saved program counter (from the ucontext_t) is in the address range beginning just before "testcancel" and ending with the first uninterruptible (via a signal) syscall instruction that enters the kernel. 4. In this case, except for certain syscalls that ALWAYS fail with EINTR even for non-interrupting signals, the kernel will reset the program counter to point at the syscall instruction during signal handling, so that the syscall is restarted when the signal handler returns. So, from the signal handler's standpoint, this looks the same as case 2, and thus it's taken care of. 5. For syscalls with side-effects, the kernel cannot restart the syscall; when it's interrupted by a signal, the kernel must cause the syscall to return with whatever partial result is obtained (e.g. partial read or write). 6. The saved program counter points just after the syscall instruction, so the signal handler won't act on cancellation. This is similar to 4. since the program counter is past the syscall instruction. So the proposed fixes are: 1. Remove the enable_asynccancel/disable_asynccancel function usage in cancellable syscall definition and instead make them call a common symbol that will check if cancellation is enabled (__syscall_cancel at nptl/cancellation.c), call the arch-specific cancellable entry-point (__syscall_cancel_arch), and cancel the thread when required. 2. Provide an arch-specific generic system call wrapper function that contains global markers. These markers will be used in the SIGCANCEL signal handler to check if the interruption has happened in a valid syscall and if the syscall has side-effects. A reference implementation sysdeps/unix/sysv/linux/syscall_cancel.c is provided. However, the markers may not be set in the expected places depending on how INTERNAL_SYSCALL_NCS is implemented by the architecture. It is expected that all architectures add an arch-specific implementation. 3. 
Rewrite the SIGCANCEL asynchronous handler to check both the cancellation type and whether the current IP from the signal handler falls between the global markers, and act accordingly. 4. Adjust libc code to replace LIBC_CANCEL_ASYNC/LIBC_CANCEL_RESET to use the appropriate cancelable syscalls. 5. Adjust 'lowlevellock-futex.h' arch-specific implementations to provide cancelable futex calls. Some architectures require specific support on syscall handling: * On i386 the syscall cancel bridge needs to use the old int80 instruction because with the optimized vDSO symbol the resulting PC value for an interrupted syscall points to an address outside the expected markers in __syscall_cancel_arch. It has been discussed in LKML [1] on how the kernel could help userland to accomplish it, but afaik discussion has stalled. Also, sysenter should not be used directly by libc since its calling convention is set by the kernel depending on the underlying x86 chip (check kernel commit 30bfa7b3488bfb1bb75c9f50a5fcac1832970c60). * mips o32 is the only kABI that requires 7-argument syscalls, and to avoid adding a requirement on all architectures to support it, mips support is added with extra internal defines. Checked on aarch64-linux-gnu, arm-linux-gnueabihf, powerpc-linux-gnu, powerpc64-linux-gnu, powerpc64le-linux-gnu, i686-linux-gnu, and x86_64-linux-gnu. [1] https://lkml.org/lkml/2016/3/8/1105 Reviewed-by: Carlos O'Donell <carlos@redhat.com>
470 lines
15 KiB
C
470 lines
15 KiB
C
/* Copyright (C) 2002-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
|
|
|
|
#ifndef _DESCR_H
|
|
#define _DESCR_H 1
|
|
|
|
#include <limits.h>
|
|
#include <sched.h>
|
|
#include <setjmp.h>
|
|
#include <stdbool.h>
|
|
#include <sys/types.h>
|
|
#include <hp-timing.h>
|
|
#include <list_t.h>
|
|
#include <lowlevellock.h>
|
|
#include <pthreaddef.h>
|
|
#include <dl-sysdep.h>
|
|
#include <thread_db.h>
|
|
#include <tls.h>
|
|
#include <unwind.h>
|
|
#include <bits/types/res_state.h>
|
|
#include <kernel-features.h>
|
|
#include <tls-internal-struct.h>
|
|
#include <internal-sigset.h>
|
|
|
|
/* Alignment applied to struct pthread.  Default to 32 when nothing
   defined it before this point, and reject any smaller value.  */
#ifndef TCB_ALIGNMENT
# define TCB_ALIGNMENT 32
#elif TCB_ALIGNMENT < 32
# error TCB_ALIGNMENT must be at least 32
#endif
|
|
|
|
|
|
/* We keep thread specific data in a special data structure, a two-level
   array.  The top-level array contains pointers to dynamically allocated
   arrays of a certain number of data pointers.  So we can implement a
   sparse array.  Each dynamic second-level array has
        PTHREAD_KEY_2NDLEVEL_SIZE
   entries.  This value shouldn't be too large.  */
#define PTHREAD_KEY_2NDLEVEL_SIZE 32

/* We need to address PTHREAD_KEYS_MAX keys with PTHREAD_KEY_2NDLEVEL_SIZE
   keys in each subarray.  The division rounds up so that a key count
   that is not a multiple of the second-level size still fits.  */
#define PTHREAD_KEY_1STLEVEL_SIZE \
  ((PTHREAD_KEYS_MAX + PTHREAD_KEY_2NDLEVEL_SIZE - 1) \
   / PTHREAD_KEY_2NDLEVEL_SIZE)
|
|
|
|
|
|
|
|
|
|
/* Internal version of the buffer to store cancellation handler
   information.  */
struct pthread_unwind_buf
{
  /* Saved jump context plus a flag recording whether the signal mask
     was saved along with it.  */
  struct
  {
    __jmp_buf jmp_buf;
    int mask_was_saved;
  } cancel_jmp_buf[1];

  union
  {
    /* This is the placeholder of the public version.  */
    void *pad[4];

    struct
    {
      /* Pointer to the previous cleanup buffer.  */
      struct pthread_unwind_buf *prev;

      /* Backward compatibility: state of the old-style cleanup
         handler at the time of the previous new-style cleanup handler
         installment.  */
      struct _pthread_cleanup_buffer *cleanup;

      /* Cancellation type before the push call.  */
      int canceltype;
    } data;
  } priv;
};
|
|
|
|
|
|
/* Opcodes and data types for communication with the signal handler to
   change user/group IDs.  */
struct xid_command
{
  int syscall_no;
  /* Enforce zero-extension for the pointer argument in

       int setgroups (size_t size, const gid_t *list);

     The kernel XID arguments are unsigned and do not require sign
     extension.  */
  unsigned long int id[3];
  volatile int cntr;
  volatile int error; /* -1: no call yet, 0: success seen, >0: error seen.  */
};
|
|
|
|
|
|
/* Data structure used by the kernel to find robust futexes.  */
struct robust_list_head
{
  void *list;
  long int futex_offset;
  void *list_op_pending;
};
|
|
|
|
|
|
/* Data structure used to handle thread priority protection.  */
struct priority_protection_data
{
  int priomax;
  /* Flexible array member; its length is chosen by whoever allocates
     the object.  */
  unsigned int priomap[];
};
|
|
|
|
|
|
/* Thread descriptor data structure. */
|
|
struct pthread
|
|
{
|
|
union
|
|
{
|
|
#if !TLS_DTV_AT_TP
|
|
/* This overlaps the TCB as used for TLS without threads (see tls.h). */
|
|
tcbhead_t header;
|
|
#else
|
|
struct
|
|
{
|
|
/* multiple_threads is enabled either when the process has spawned at
|
|
least one thread or when a single-threaded process cancels itself.
|
|
This enables additional code to introduce locking before doing some
|
|
compare_and_exchange operations and also enable cancellation points.
|
|
The concepts of multiple threads and cancellation points ideally
|
|
should be separate, since it is not necessary for multiple threads to
|
|
have been created for cancellation points to be enabled, as is the
|
|
case is when single-threaded process cancels itself.
|
|
|
|
Since enabling multiple_threads enables additional code in
|
|
cancellation points and compare_and_exchange operations, there is a
|
|
potential for an unneeded performance hit when it is enabled in a
|
|
single-threaded, self-canceling process. This is OK though, since a
|
|
single-threaded process will enable async cancellation only when it
|
|
looks to cancel itself and is hence going to end anyway. */
|
|
int multiple_threads;
|
|
int gscope_flag;
|
|
} header;
|
|
#endif
|
|
|
|
/* This extra padding has no special purpose, and this structure layout
|
|
is private and subject to change without affecting the official ABI.
|
|
We just have it here in case it might be convenient for some
|
|
implementation-specific instrumentation hack or suchlike. */
|
|
void *__padding[24];
|
|
};
|
|
|
|
/* This descriptor's link on the GL (dl_stack_used) or
|
|
GL (dl_stack_user) list. */
|
|
list_t list;
|
|
|
|
/* Thread ID - which is also a 'is this thread descriptor (and
|
|
therefore stack) used' flag. */
|
|
pid_t tid;
|
|
|
|
/* List of robust mutexes the thread is holding. */
|
|
#if __PTHREAD_MUTEX_HAVE_PREV
|
|
void *robust_prev;
|
|
struct robust_list_head robust_head;
|
|
|
|
/* The list above is strange. It is basically a double linked list
|
|
but the pointer to the next/previous element of the list points
|
|
in the middle of the object, the __next element. Whenever
|
|
casting to __pthread_list_t we need to adjust the pointer
|
|
first.
|
|
These operations are effectively concurrent code in that the thread
|
|
can get killed at any point in time and the kernel takes over. Thus,
|
|
the __next elements are a kind of concurrent list and we need to
|
|
enforce using compiler barriers that the individual operations happen
|
|
in such a way that the kernel always sees a consistent list. The
|
|
backward links (ie, the __prev elements) are not used by the kernel.
|
|
FIXME We should use relaxed MO atomic operations here and signal fences
|
|
because this kind of concurrency is similar to synchronizing with a
|
|
signal handler. */
|
|
# define QUEUE_PTR_ADJUST (offsetof (__pthread_list_t, __next))
|
|
|
|
# define ENQUEUE_MUTEX_BOTH(mutex, val) \
|
|
do { \
|
|
__pthread_list_t *next = (__pthread_list_t *) \
|
|
((((uintptr_t) THREAD_GETMEM (THREAD_SELF, robust_head.list)) & ~1ul) \
|
|
- QUEUE_PTR_ADJUST); \
|
|
next->__prev = (void *) &mutex->__data.__list.__next; \
|
|
mutex->__data.__list.__next = THREAD_GETMEM (THREAD_SELF, \
|
|
robust_head.list); \
|
|
mutex->__data.__list.__prev = (void *) &THREAD_SELF->robust_head; \
|
|
/* Ensure that the new list entry is ready before we insert it. */ \
|
|
__asm ("" ::: "memory"); \
|
|
THREAD_SETMEM (THREAD_SELF, robust_head.list, \
|
|
(void *) (((uintptr_t) &mutex->__data.__list.__next) \
|
|
| val)); \
|
|
} while (0)
|
|
# define DEQUEUE_MUTEX(mutex) \
|
|
do { \
|
|
__pthread_list_t *next = (__pthread_list_t *) \
|
|
((char *) (((uintptr_t) mutex->__data.__list.__next) & ~1ul) \
|
|
- QUEUE_PTR_ADJUST); \
|
|
next->__prev = mutex->__data.__list.__prev; \
|
|
__pthread_list_t *prev = (__pthread_list_t *) \
|
|
((char *) (((uintptr_t) mutex->__data.__list.__prev) & ~1ul) \
|
|
- QUEUE_PTR_ADJUST); \
|
|
prev->__next = mutex->__data.__list.__next; \
|
|
/* Ensure that we remove the entry from the list before we change the \
|
|
__next pointer of the entry, which is read by the kernel. */ \
|
|
__asm ("" ::: "memory"); \
|
|
mutex->__data.__list.__prev = NULL; \
|
|
mutex->__data.__list.__next = NULL; \
|
|
} while (0)
|
|
#else
|
|
union
|
|
{
|
|
__pthread_slist_t robust_list;
|
|
struct robust_list_head robust_head;
|
|
};
|
|
|
|
# define ENQUEUE_MUTEX_BOTH(mutex, val) \
|
|
do { \
|
|
mutex->__data.__list.__next \
|
|
= THREAD_GETMEM (THREAD_SELF, robust_list.__next); \
|
|
/* Ensure that the new list entry is ready before we insert it. */ \
|
|
__asm ("" ::: "memory"); \
|
|
THREAD_SETMEM (THREAD_SELF, robust_list.__next, \
|
|
(void *) (((uintptr_t) &mutex->__data.__list) | val)); \
|
|
} while (0)
|
|
# define DEQUEUE_MUTEX(mutex) \
|
|
do { \
|
|
__pthread_slist_t *runp = (__pthread_slist_t *) \
|
|
(((uintptr_t) THREAD_GETMEM (THREAD_SELF, robust_list.__next)) & ~1ul); \
|
|
if (runp == &mutex->__data.__list) \
|
|
THREAD_SETMEM (THREAD_SELF, robust_list.__next, runp->__next); \
|
|
else \
|
|
{ \
|
|
__pthread_slist_t *next = (__pthread_slist_t *) \
|
|
(((uintptr_t) runp->__next) & ~1ul); \
|
|
while (next != &mutex->__data.__list) \
|
|
{ \
|
|
runp = next; \
|
|
next = (__pthread_slist_t *) (((uintptr_t) runp->__next) & ~1ul); \
|
|
} \
|
|
\
|
|
runp->__next = next->__next; \
|
|
/* Ensure that we remove the entry from the list before we change the \
|
|
__next pointer of the entry, which is read by the kernel. */ \
|
|
__asm ("" ::: "memory"); \
|
|
mutex->__data.__list.__next = NULL; \
|
|
} \
|
|
} while (0)
|
|
#endif
|
|
#define ENQUEUE_MUTEX(mutex) ENQUEUE_MUTEX_BOTH (mutex, 0)
|
|
#define ENQUEUE_MUTEX_PI(mutex) ENQUEUE_MUTEX_BOTH (mutex, 1)
|
|
|
|
/* List of cleanup buffers. */
|
|
struct _pthread_cleanup_buffer *cleanup;
|
|
|
|
/* Unwind information. */
|
|
struct pthread_unwind_buf *cleanup_jmp_buf;
|
|
#define HAVE_CLEANUP_JMP_BUF
|
|
|
|
/* Flags determining processing of cancellation. */
|
|
int cancelhandling;
|
|
/* Bit set if cancellation is disabled. */
|
|
#define CANCELSTATE_BIT 0
|
|
#define CANCELSTATE_BITMASK (1 << CANCELSTATE_BIT)
|
|
/* Bit set if asynchronous cancellation mode is selected. */
|
|
#define CANCELTYPE_BIT 1
|
|
#define CANCELTYPE_BITMASK (1 << CANCELTYPE_BIT)
|
|
/* Bit set if canceling has been initiated. */
|
|
#define CANCELING_BIT 2
|
|
#define CANCELING_BITMASK (1 << CANCELING_BIT)
|
|
/* Bit set if canceled. */
|
|
#define CANCELED_BIT 3
|
|
#define CANCELED_BITMASK (1 << CANCELED_BIT)
|
|
/* Bit set if thread is exiting. */
|
|
#define EXITING_BIT 4
|
|
#define EXITING_BITMASK (1 << EXITING_BIT)
|
|
/* Bit set if thread terminated and TCB is freed. */
|
|
#define TERMINATED_BIT 5
|
|
#define TERMINATED_BITMASK (1 << TERMINATED_BIT)
|
|
/* Bit set if thread is supposed to change XID. */
|
|
#define SETXID_BIT 6
|
|
#define SETXID_BITMASK (1 << SETXID_BIT)
|
|
|
|
/* Flags. Including those copied from the thread attribute. */
|
|
int flags;
|
|
|
|
/* We allocate one block of references here. This should be enough
|
|
to avoid allocating any memory dynamically for most applications. */
|
|
struct pthread_key_data
|
|
{
|
|
/* Sequence number. We use uintptr_t to not require padding on
|
|
32- and 64-bit machines. On 64-bit machines it helps to avoid
|
|
wrapping, too. */
|
|
uintptr_t seq;
|
|
|
|
/* Data pointer. */
|
|
void *data;
|
|
} specific_1stblock[PTHREAD_KEY_2NDLEVEL_SIZE];
|
|
|
|
/* Two-level array for the thread-specific data. */
|
|
struct pthread_key_data *specific[PTHREAD_KEY_1STLEVEL_SIZE];
|
|
|
|
/* Flag which is set when specific data is set. */
|
|
bool specific_used;
|
|
|
|
/* True if events must be reported. */
|
|
bool report_events;
|
|
|
|
/* True if the user provided the stack. */
|
|
bool user_stack;
|
|
|
|
/* True if thread must stop at startup time. */
|
|
bool stopped_start;
|
|
|
|
/* Indicate that a thread creation setup has failed (for instance the
|
|
scheduler or affinity). */
|
|
int setup_failed;
|
|
|
|
/* Lock to synchronize access to the descriptor. */
|
|
int lock;
|
|
|
|
/* Lock for synchronizing setxid calls. */
|
|
unsigned int setxid_futex;
|
|
|
|
/* If the thread waits to join another one the ID of the latter is
|
|
stored here.
|
|
|
|
In case a thread is detached this field contains a pointer of the
|
|
TCB if the thread itself. This is something which cannot happen
|
|
in normal operation. */
|
|
struct pthread *joinid;
|
|
/* Check whether a thread is detached. */
|
|
#define IS_DETACHED(pd) ((pd)->joinid == (pd))
|
|
|
|
/* The result of the thread function. */
|
|
void *result;
|
|
|
|
/* Scheduling parameters for the new thread. */
|
|
struct sched_param schedparam;
|
|
int schedpolicy;
|
|
|
|
/* Start position of the code to be executed and the argument passed
|
|
to the function. */
|
|
void *(*start_routine) (void *);
|
|
void *arg;
|
|
|
|
/* Debug state. */
|
|
td_eventbuf_t eventbuf;
|
|
/* Next descriptor with a pending event. */
|
|
struct pthread *nextevent;
|
|
|
|
/* Machine-specific unwind info. */
|
|
struct _Unwind_Exception exc;
|
|
|
|
/* If nonzero, pointer to the area allocated for the stack and guard. */
|
|
void *stackblock;
|
|
/* Size of the stackblock area including the guard. */
|
|
size_t stackblock_size;
|
|
/* Size of the included guard area. */
|
|
size_t guardsize;
|
|
/* This is what the user specified and what we will report. */
|
|
size_t reported_guardsize;
|
|
|
|
/* Thread Priority Protection data. */
|
|
struct priority_protection_data *tpp;
|
|
|
|
/* Resolver state. */
|
|
struct __res_state res;
|
|
|
|
/* Signal mask for the new thread. Used during thread startup to
|
|
restore the signal mask. (Threads are launched with all signals
|
|
masked.) */
|
|
internal_sigset_t sigmask;
|
|
|
|
/* Used by the exception handling implementation in the dynamic loader. */
|
|
struct rtld_catch *rtld_catch;
|
|
|
|
/* Indicates whether is a C11 thread created by thrd_creat. */
|
|
bool c11;
|
|
|
|
/* Used in __pthread_kill_internal to detected a thread that has
|
|
exited or is about to exit. exit_lock must only be acquired
|
|
after blocking signals. */
|
|
bool exiting;
|
|
int exit_lock; /* A low-level lock (for use with __libc_lock_init etc). */
|
|
|
|
/* Used on strsignal. */
|
|
struct tls_internal_t tls_state;
|
|
|
|
/* rseq area registered with the kernel. Use a custom definition
|
|
here to isolate from kernel struct rseq changes. The
|
|
implementation of sched_getcpu needs acccess to the cpu_id field;
|
|
the other fields are unused and not included here. */
|
|
union
|
|
{
|
|
struct
|
|
{
|
|
uint32_t cpu_id_start;
|
|
uint32_t cpu_id;
|
|
};
|
|
char pad[32]; /* Original rseq area size. */
|
|
} rseq_area __attribute__ ((aligned (32)));
|
|
|
|
/* Amount of end padding, if any, in this structure.
|
|
This definition relies on rseq_area being last. */
|
|
#define PTHREAD_STRUCT_END_PADDING \
|
|
(sizeof (struct pthread) - offsetof (struct pthread, rseq_area) \
|
|
+ sizeof ((struct pthread) {}.rseq_area))
|
|
} __attribute ((aligned (TCB_ALIGNMENT)));
|
|
|
|
static inline bool
|
|
cancel_enabled (int value)
|
|
{
|
|
return (value & CANCELSTATE_BITMASK) == 0;
|
|
}
|
|
|
|
static inline bool
|
|
cancel_async_enabled (int value)
|
|
{
|
|
return (value & CANCELTYPE_BITMASK) != 0;
|
|
}
|
|
|
|
static inline bool
|
|
cancel_exiting (int value)
|
|
{
|
|
return (value & EXITING_BITMASK) != 0;
|
|
}
|
|
|
|
static inline bool
|
|
cancel_enabled_and_canceled (int value)
|
|
{
|
|
return (value & (CANCELSTATE_BITMASK | CANCELED_BITMASK | EXITING_BITMASK
|
|
| TERMINATED_BITMASK))
|
|
== CANCELED_BITMASK;
|
|
}
|
|
|
|
static inline bool
|
|
cancel_enabled_and_canceled_and_async (int value)
|
|
{
|
|
return ((value) & (CANCELSTATE_BITMASK | CANCELTYPE_BITMASK | CANCELED_BITMASK
|
|
| EXITING_BITMASK | TERMINATED_BITMASK))
|
|
== (CANCELTYPE_BITMASK | CANCELED_BITMASK);
|
|
}
|
|
|
|
/* This yields the pointer that TLS support code calls the thread pointer.
   With TLS_TCB_AT_TP it is the descriptor pointer itself; with
   TLS_DTV_AT_TP it lies TLS_PRE_TCB_SIZE bytes past the descriptor.  */
#if TLS_TCB_AT_TP
# define TLS_TPADJ(pd) (pd)
#elif TLS_DTV_AT_TP
# define TLS_TPADJ(pd) ((struct pthread *)((char *) (pd) + TLS_PRE_TCB_SIZE))
#endif
|
|
|
|
#endif /* descr.h */
|