Skip to content
This repository was archived by the owner on May 1, 2026. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions .github/workflows/ci-pr.yml
Original file line number Diff line number Diff line change
Expand Up @@ -133,8 +133,12 @@ jobs:
| grep -cE 'call .*<(explicit_bzero|ksuid_explicit_bzero)' \
|| true)
echo "wipe call sites in optimised .so: $n"
if [ "$n" -lt 4 ]; then
echo "::error::Expected at least 4 surviving wipe calls in libksuid.so.*; found $n. DSE may have eaten the wipes." >&2
# Floor was 4 (3 sites in rand_tls.c + 1 in chacha20.c) before
# issue #4. The new ksuid_random_thread_state_wipe adds a fifth
# ksuid_explicit_bzero call to the library's surviving set, so
# the floor moves to 5.
if [ "$n" -lt 5 ]; then
echo "::error::Expected at least 5 surviving wipe calls in libksuid.so.*; found $n. DSE may have eaten the wipes." >&2
exit 1
fi

Expand Down
14 changes: 14 additions & 0 deletions libksuid/ksuid.h
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,20 @@ extern "C"
* streams without synchronisation; concurrent calls from the *same*
* thread are not supported. On entropy-source failure the function
* returns KSUID_ERR_RNG and leaves |*out| untouched.
*
* Thread-exit residue: the per-thread CSPRNG state holds 64 bytes
* of ChaCha20 state plus a 64-byte keystream window. On platforms
* with a thread-exit hook (glibc 2.18+ via __cxa_thread_atexit_impl,
* MUSL >= 1.2.0, libc++abi on macOS, FLS on Windows) libksuid wipes
* this state automatically when the owning thread exits. On other
* platforms the state persists in the TLS block until the OS
* reclaims it. Callers requiring stronger guarantees should call
* ksuid_set_rand(NULL, NULL) to no-op the override path and then
* draw + discard a single payload via ksuid_new() before joining
* the worker thread; the next call from the same TLS slot sees a
* fresh seed and the bounded reseed cadence (1 MiB / 1 hour /
* fork) keeps the residue window small even without a thread-exit
* hook.
* -------------------------------------------------------------------------- */

/* Generate a new KSUID stamped with the current wall-clock time. */
Expand Down
43 changes: 43 additions & 0 deletions libksuid/rand.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,4 +29,47 @@ int ksuid_random_bytes (uint8_t * buf, size_t n);
* its next ksuid_random_bytes call. */
void ksuid_random_force_reseed (void);

/* Issue #4 thread-exit hook. Wipes the calling thread's CSPRNG
* state in place via ksuid_explicit_bzero so the 64-byte ChaCha20
* state and the 64-byte keystream window do not survive the thread
* after it exits.
*
* Commit 1 of the issue #4 series provides the function body; the
* platform-specific automatic registration (__cxa_thread_atexit_impl
* on glibc / libc++abi / MUSL >= 1.2.0; FlsAlloc on Windows) lands
* in commit 2. Without that registration the function is reachable
* only via the test harness or a manual call from a downstream
* caller. */
void ksuid_random_thread_state_wipe (void);

#ifdef KSUID_TESTING
/* Test-only hooks compiled into the test binary via -DKSUID_TESTING=1
* (set per-test in tests/meson.build). They give tests/test_rand_tls.c
* a way to drive the wipe path deterministically without depending on
* thread-exit timing, and to peek at the post-wipe TLS state to prove
* the bytes were actually zeroed. None of these symbols are exported
* from the library; production builds compile without -DKSUID_TESTING
* and never see the prototypes. */

/* Atomic counter incremented on every entry to
* ksuid_random_thread_state_wipe. The test asserts it ticks when the
* destructor runs at thread exit. */
extern _Atomic int ksuid_thread_exit_wipes_observed;

/* Fill the calling thread's TLS RNG state with a known sentinel
* pattern (0xa5 throughout, including the seeded flag) so the next
* call to ksuid_random_thread_state_wipe has something non-zero to
* erase. Must be called before any draw on the same thread. */
void ksuid_random_thread_state_set_sentinel_for_testing (void);

/* Copy the calling thread's TLS RNG state bytes into |out| (which
* must be at least sizeof(ksuid_tls_rng_t) -- the test is allowed to
* over-allocate). Used to assert the wipe actually zeroed the
* region. */
void ksuid_random_thread_state_peek_for_testing (uint8_t * out, size_t out_len);

/* Size in bytes that a peek buffer must accommodate. */
size_t ksuid_random_thread_state_size_for_testing (void);
#endif

#endif /* KSUID_RAND_H */
196 changes: 190 additions & 6 deletions libksuid/rand_tls.c
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,13 @@
* itself; if it ever shows up in profiling the obvious tightening is
* to gate the getpid() call behind the bytes_emitted threshold.
*/
/* The for_testing helpers below are always defined; their
* prototypes in rand.h are gated behind KSUID_TESTING so production
* callers can't reach them. Setting KSUID_TESTING here -- before the
* rand.h include -- pulls those prototypes into this TU and silences
* the -Wmissing-prototypes warning that would otherwise fire on the
* helper definitions further down. */
#define KSUID_TESTING 1
#include <libksuid/rand.h>

#include <stdbool.h>
Expand All @@ -40,12 +47,21 @@
#include <libksuid/chacha20.h>
#include <libksuid/wipe.h>

/* TODO(#4): the per-thread ksuid_tls_rng_t below lives until the OS
* reclaims its TLS block, which means a thread that exits leaves the
* 64-byte ChaCha state and the 64-byte keystream buffer in process
* memory until then. Issue #4 covers wiping that state on thread
* exit. This file's wipes are bounded to the short-lived locals --
* the 44-byte seed buffer and the in-flight keystream chunks. */
/* Thread-exit residue policy: the per-thread ksuid_tls_rng_t below
* holds 64 bytes of ChaCha20 state plus a 64-byte keystream window.
* On platforms with a thread-exit hook (glibc 2.18+
* __cxa_thread_atexit_impl, MUSL >= 1.2.0, libc++abi on macOS, FLS
* on Windows -- detected and registered in commit 2 of issue #4)
* ksuid_random_thread_state_wipe is invoked automatically at thread
* teardown. On platforms without such a hook the residue persists
* until the OS reclaims the TLS block; callers requiring stronger
* guarantees should call ksuid_random_force_reseed() before joining
* the worker thread.
*
* The wipe entry point itself is implemented in this file (commit 1)
* even when no platform hook fires, so the test harness can drive it
* via the KSUID_TESTING-gated for_testing helpers added in commit 3.
*/

#define KSUID_RNG_RESEED_BYTES (1u << 20) /* 1 MiB */
#define KSUID_RNG_RESEED_SECONDS 3600 /* 1 hour */
Expand All @@ -59,10 +75,165 @@ typedef struct
int64_t seed_time; /* TIME_UTC seconds at last seed */
int64_t seed_pid; /* getpid()/_getpid() at last seed */
bool seeded;
bool destructor_registered; /* thread-exit wipe registered yet? */
} ksuid_tls_rng_t;

static _Thread_local ksuid_tls_rng_t ksuid_tls_rng_;

/* Re-entry guard for ksuid_random_thread_state_wipe. A future change
* that adds, e.g., a debug-log call inside the wipe path could call
* back into ksuid_random_bytes; the guarded ksuid_random_bytes path
* returns the RNG-failure sentinel rather than reseeding into a slot
* that is in the middle of being torn down. */
static _Thread_local bool ksuid_tls_in_destructor_;

/* Atomic counter incremented on every entry to
* ksuid_random_thread_state_wipe. Always defined and always
* incremented, regardless of KSUID_TESTING -- the cost is one
* relaxed atomic increment per wipe (~5 ns on x86_64) and the
* counter only matters to the test harness, which sees it through
* the KSUID_TESTING-gated extern declaration in rand.h. */
#include <stdatomic.h>
_Atomic int ksuid_thread_exit_wipes_observed;

void
ksuid_random_thread_state_wipe (void)
{
ksuid_tls_in_destructor_ = true;
ksuid_explicit_bzero (&ksuid_tls_rng_, sizeof ksuid_tls_rng_);
ksuid_tls_in_destructor_ = false;
atomic_fetch_add_explicit (&ksuid_thread_exit_wipes_observed, 1,
memory_order_relaxed);
}

void
ksuid_random_thread_state_set_sentinel_for_testing (void)
{
/* The test must have already triggered registration on this
* thread (via a real draw). We deliberately preserve the
* destructor_registered flag so the previously-registered hook
* still fires; the seeded flag is also kept true so the next
* draw doesn't overwrite the sentinel through the seed path. */
bool registered = ksuid_tls_rng_.destructor_registered;
memset (&ksuid_tls_rng_, 0xa5, sizeof ksuid_tls_rng_);
ksuid_tls_rng_.seeded = true;
ksuid_tls_rng_.destructor_registered = registered;
}

void
ksuid_random_thread_state_peek_for_testing (uint8_t *out, size_t out_len)
{
size_t n = sizeof ksuid_tls_rng_;
if (out_len < n)
n = out_len;
memcpy (out, &ksuid_tls_rng_, n);
}

size_t
ksuid_random_thread_state_size_for_testing (void)
{
return sizeof ksuid_tls_rng_;
}

/* Per-platform thread-exit registration. Glibc / libc++abi /
* MUSL >= 1.2.0 expose __cxa_thread_atexit_impl, an undocumented
* but stable libc entry point that runs callbacks at thread exit.
* Windows uses FlsAlloc -- a Fiber Local Storage slot whose
* destructor callback fires on thread teardown regardless of
* static-vs-DLL link mode. Both paths are gated behind a meson
* cc.links() probe (commit 2 of the issue #4 series); platforms
* that don't match either branch fall through to the documented
* residue policy in the public header. */
#if defined(KSUID_HAVE_CXA_THREAD_ATEXIT_IMPL)

/* Both identifiers below are reserved (double-underscore prefix), but
* they're how glibc / libc++abi spell the symbols we have to call.
* No public header declares them; we forward-declare them here. */
/* NOLINTNEXTLINE(bugprone-reserved-identifier,cert-dcl37-c,cert-dcl51-cpp) */
extern int __cxa_thread_atexit_impl (void (*fn) (void *), void *arg, void *dso);
/* NOLINTNEXTLINE(bugprone-reserved-identifier,cert-dcl37-c,cert-dcl51-cpp) */
extern void *__dso_handle;

static void
ksuid_tls_atexit_trampoline (void *unused)
{
(void) unused;
ksuid_random_thread_state_wipe ();
}

static void
ksuid_tls_register_thread_exit (ksuid_tls_rng_t *r)
{
if (r->destructor_registered)
return;
r->destructor_registered = true;
/* The third argument scopes the registration to this DSO so a
* dlclose(libksuid.so) tears down its registrations cleanly.
* __dso_handle is `void *`, so we pass its address (a `void **`)
* cast to the `void *` ABI parameter via an explicit cast --
* silencing bugprone-multi-level-implicit-pointer-conversion. */
(void) __cxa_thread_atexit_impl (ksuid_tls_atexit_trampoline, NULL,
(void *) &__dso_handle);
}

#elif defined(KSUID_HAVE_FLS)

# define WIN32_LEAN_AND_MEAN
# include <windows.h>

static DWORD ksuid_fls_index_ = FLS_OUT_OF_INDEXES;
static INIT_ONCE ksuid_fls_init_ = INIT_ONCE_STATIC_INIT;

/* FlsAlloc callback signature is (PVOID) under NTAPI calling
* convention; mismatching it would corrupt the stack on x86_32 MSVC.
* The slot value is just a non-NULL sentinel ("this thread
* participated") -- the actual TLS state still lives in
* _Thread_local storage and is reachable from the same thread
* during teardown, before the runtime tears down its TLS. */
static VOID NTAPI
ksuid_fls_destroy (PVOID p)
{
(void) p;
ksuid_random_thread_state_wipe ();
}

static BOOL CALLBACK
ksuid_fls_init_once (PINIT_ONCE init_once, PVOID parameter, PVOID *context)
{
(void) init_once;
(void) parameter;
(void) context;
ksuid_fls_index_ = FlsAlloc (ksuid_fls_destroy);
return TRUE;
}

static void
ksuid_tls_register_thread_exit (ksuid_tls_rng_t *r)
{
if (r->destructor_registered)
return;
InitOnceExecuteOnce (&ksuid_fls_init_, ksuid_fls_init_once, NULL, NULL);
if (ksuid_fls_index_ == FLS_OUT_OF_INDEXES)
return;
/* FlsSetValue with a non-NULL sentinel marks this thread as
* participating; ksuid_fls_destroy fires on thread exit. */
if (FlsSetValue (ksuid_fls_index_, (PVOID) (uintptr_t) 1))
r->destructor_registered = true;
}

#else /* No thread-exit hook on this platform */

static void
ksuid_tls_register_thread_exit (ksuid_tls_rng_t *r)
{
/* Documented residue path: nothing to register. The bounded
* reseed cadence and ksuid_random_force_reseed are the only
* mitigations. */
(void) r;
}

#endif

/* Returns wall-clock seconds (TIME_UTC), or -1 on clock failure. The
* sentinel makes the should-reseed predicate fall through to the
* "now < seed_time" branch which forces a reseed -- the conservative
Expand Down Expand Up @@ -116,6 +287,13 @@ ksuid_tls_rng_seed (ksuid_tls_rng_t *r)
r->bytes_emitted = 0;
r->seed_pid = KSUID_GETPID ();
r->seed_time = ksuid_now_seconds ();
/* Register the thread-exit wipe BEFORE flipping the seeded flag
* -- if registration fails partway and the thread later exits we
* must not have a half-wired state where the TLS slot looks
* seeded but the destructor never fires. The registration is
* idempotent via r->destructor_registered, so calling it on
* every reseed is cheap. */
ksuid_tls_register_thread_exit (r);
r->seeded = true;
return 0;
}
Expand Down Expand Up @@ -148,6 +326,12 @@ ksuid_random_force_reseed (void)
int
ksuid_random_bytes (uint8_t *buf, size_t n)
{
/* Re-entry from inside ksuid_random_thread_state_wipe is a bug:
* it would reseed into a TLS slot that is being torn down. Bail
* with the RNG-failure sentinel so the caller surfaces the
* problem. */
if (ksuid_tls_in_destructor_)
return -1;
ksuid_tls_rng_t *r = &ksuid_tls_rng_;
if (ksuid_tls_rng_should_reseed (r)) {
if (ksuid_tls_rng_seed (r) < 0)
Expand Down
27 changes: 27 additions & 0 deletions meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,33 @@ elif cc.has_function('memset_s',
endif
summary({'wipe backend' : wipe_backend}, section : 'libksuid')

# Issue #4 thread-exit wipe registration. Probe via cc.links() with
# an explicit prototype rather than cc.has_function() because
# __cxa_thread_atexit_impl is a private libc symbol that no public
# header declares -- has_function would either spuriously fail
# (no prototype) or spuriously pass (linker found a different
# weak alias). Linking with the explicit prototype is the only
# honest signal that the actual ABI matches what we're going to
# call.
thread_exit_backend = 'documented residue (no automatic wipe)'
cxa_thread_atexit_test = '''
extern int __cxa_thread_atexit_impl(void (*fn)(void *), void *arg, void *dso);
extern void *__dso_handle;
static void wipe(void *p) { (void)p; }
int main(void) {
return __cxa_thread_atexit_impl(wipe, (void *)0, &__dso_handle);
}
'''
if cc.links(cxa_thread_atexit_test,
name : '__cxa_thread_atexit_impl + __dso_handle')
common_args += '-DKSUID_HAVE_CXA_THREAD_ATEXIT_IMPL=1'
thread_exit_backend = '__cxa_thread_atexit_impl'
elif host_machine.system() == 'windows'
common_args += '-DKSUID_HAVE_FLS=1'
thread_exit_backend = 'FlsAlloc (Windows)'
endif
summary({'thread-exit wipe' : thread_exit_backend}, section : 'libksuid')

# Windows: the supported entropy source is BCryptGenRandom. The whole
# /dev/urandom + getrandom + getentropy chain doesn't apply, and
# rand_os.c switches to a Bcrypt-only path when KSUID_HAVE_BCRYPT is
Expand Down
18 changes: 18 additions & 0 deletions tests/meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,28 @@ foreach t : base_tests
endforeach

if have_threads_h and threads_dep.found()
# test_rand_tls compiles with -DKSUID_TESTING=1 only when the
# platform exposes a thread-exit hook to assert against (issue #4
# commit 3). KSUID_TESTING reveals the prototypes of the
# for_testing helpers + the atomic ksuid_thread_exit_wipes_observed
# counter in libksuid/rand.h. The library itself unconditionally
# defines those symbols (cost: one relaxed atomic increment per
# wipe, ~5 ns on x86_64), so the test only needs the prototype to
# see them; no separate testing-enabled library object is
# required.
thread_exit_test_args = []
has_cxa = cc.get_define('KSUID_HAVE_CXA_THREAD_ATEXIT_IMPL',
args : common_args) == '1'
has_fls = cc.get_define('KSUID_HAVE_FLS',
args : common_args) == '1'
if has_cxa or has_fls
thread_exit_test_args += '-DKSUID_TESTING=1'
endif
exe = executable('test_rand_tls', 'test_rand_tls.c',
include_directories : test_inc,
link_with : test_link,
dependencies : threads_dep,
c_args : thread_exit_test_args,
)
test('test_rand_tls', exe, timeout : 30)
else
Expand Down
Loading
Loading