diff --git a/.github/workflows/ci-pr.yml b/.github/workflows/ci-pr.yml index b7d4524..4fbaa4a 100644 --- a/.github/workflows/ci-pr.yml +++ b/.github/workflows/ci-pr.yml @@ -133,8 +133,12 @@ jobs: | grep -cE 'call .*<(explicit_bzero|ksuid_explicit_bzero)' \ || true) echo "wipe call sites in optimised .so: $n" - if [ "$n" -lt 4 ]; then - echo "::error::Expected at least 4 surviving wipe calls in libksuid.so.*; found $n. DSE may have eaten the wipes." >&2 + # Floor was 4 (3 sites in rand_tls.c + 1 in chacha20.c) before + # issue #4. The new ksuid_random_thread_state_wipe adds a fifth + # ksuid_explicit_bzero call to the library's surviving set, so + # the floor moves to 5. + if [ "$n" -lt 5 ]; then + echo "::error::Expected at least 5 surviving wipe calls in libksuid.so.*; found $n. DSE may have eaten the wipes." >&2 exit 1 fi diff --git a/libksuid/ksuid.h b/libksuid/ksuid.h index 3dc400b..af1889c 100644 --- a/libksuid/ksuid.h +++ b/libksuid/ksuid.h @@ -191,6 +191,20 @@ extern "C" * streams without synchronisation; concurrent calls from the *same* * thread are not supported. On entropy-source failure the function * returns KSUID_ERR_RNG and leaves |*out| untouched. + * + * Thread-exit residue: the per-thread CSPRNG state holds 64 bytes + * of ChaCha20 state plus a 64-byte keystream window. On platforms + * with a thread-exit hook (glibc 2.18+ via __cxa_thread_atexit_impl, + * MUSL >= 1.2.0, libc++abi on macOS, FLS on Windows) libksuid wipes + * this state automatically when the owning thread exits. On other + * platforms the state persists in the TLS block until the OS + * reclaims it. Callers requiring stronger guarantees should call + * ksuid_set_rand(NULL, NULL) to no-op the override path and then + * draw + discard a single payload via ksuid_new() before joining + * the worker thread; the next call from the same TLS slot sees a + * fresh seed and the bounded reseed cadence (1 MiB / 1 hour / + * fork) keeps the residue window small even without a thread-exit + * hook. * -------------------------------------------------------------------------- */ /* Generate a new KSUID stamped with the current wall-clock time. */ diff --git a/libksuid/rand.h b/libksuid/rand.h index 51c8c29..8aaa1a9 100644 --- a/libksuid/rand.h +++ b/libksuid/rand.h @@ -29,4 +29,47 @@ int ksuid_random_bytes (uint8_t * buf, size_t n); * its next ksuid_random_bytes call. */ void ksuid_random_force_reseed (void); +/* Issue #4 thread-exit hook. Wipes the calling thread's CSPRNG + * state in place via ksuid_explicit_bzero so the 64-byte ChaCha20 + * state and the 64-byte keystream window do not survive the thread + * after it exits. + * + * Commit 1 of the issue #4 series provides the function body; the + * platform-specific automatic registration (__cxa_thread_atexit_impl + * on glibc / libc++abi / MUSL >= 1.2.0; FlsAlloc on Windows) lands + * in commit 2. Without that registration the function is reachable + * only via the test harness or a manual call from a downstream + * caller. */ +void ksuid_random_thread_state_wipe (void); + +#ifdef KSUID_TESTING +/* Test-only hooks compiled into the test binary via -DKSUID_TESTING=1 + * (set per-test in tests/meson.build). They give tests/test_rand_tls.c + * a way to drive the wipe path deterministically without depending on + * thread-exit timing, and to peek at the post-wipe TLS state to prove + * the bytes were actually zeroed. None of these symbols are exported + * from the library; production builds compile without -DKSUID_TESTING + * and never see the prototypes. */ + +/* Atomic counter incremented on every entry to + * ksuid_random_thread_state_wipe. The test asserts it ticks when the + * destructor runs at thread exit. */ +extern _Atomic int ksuid_thread_exit_wipes_observed; + +/* Fill the calling thread's TLS RNG state with a known sentinel + * pattern (0xa5 throughout, including the seeded flag) so the next + * call to ksuid_random_thread_state_wipe has something non-zero to + * erase. Must be called before any draw on the same thread. */ +void ksuid_random_thread_state_set_sentinel_for_testing (void); + +/* Copy the calling thread's TLS RNG state bytes into |out| (which + * must be at least sizeof(ksuid_tls_rng_t) -- the test is allowed to + * over-allocate). Used to assert the wipe actually zeroed the + * region. */ +void ksuid_random_thread_state_peek_for_testing (uint8_t * out, size_t out_len); + +/* Size in bytes that a peek buffer must accommodate. */ +size_t ksuid_random_thread_state_size_for_testing (void); +#endif + #endif /* KSUID_RAND_H */ diff --git a/libksuid/rand_tls.c b/libksuid/rand_tls.c index d117591..14f1833 100644 --- a/libksuid/rand_tls.c +++ b/libksuid/rand_tls.c @@ -19,6 +19,13 @@ * itself; if it ever shows up in profiling the obvious tightening is * to gate the getpid() call behind the bytes_emitted threshold. */ +/* The for_testing helpers below are always defined; their + * prototypes in rand.h are gated behind KSUID_TESTING so production + * callers can't reach them. Setting KSUID_TESTING here -- before the + * rand.h include -- pulls those prototypes into this TU and silences + * the -Wmissing-prototypes warning that would otherwise fire on the + * helper definitions further down. */ +#define KSUID_TESTING 1 #include #include @@ -40,12 +47,21 @@ #include #include -/* TODO(#4): the per-thread ksuid_tls_rng_t below lives until the OS - * reclaims its TLS block, which means a thread that exits leaves the - * 64-byte ChaCha state and the 64-byte keystream buffer in process - * memory until then. Issue #4 covers wiping that state on thread - * exit. This file's wipes are bounded to the short-lived locals -- - * the 44-byte seed buffer and the in-flight keystream chunks. */ +/* Thread-exit residue policy: the per-thread ksuid_tls_rng_t below + * holds 64 bytes of ChaCha20 state plus a 64-byte keystream window. + * On platforms with a thread-exit hook (glibc 2.18+ + * __cxa_thread_atexit_impl, MUSL >= 1.2.0, libc++abi on macOS, FLS + * on Windows -- detected and registered in commit 2 of issue #4) + * ksuid_random_thread_state_wipe is invoked automatically at thread + * teardown. On platforms without such a hook the residue persists + * until the OS reclaims the TLS block; callers requiring stronger + * guarantees should call ksuid_random_force_reseed() before joining + * the worker thread. + * + * The wipe entry point itself is implemented in this file (commit 1) + * even when no platform hook fires, so the test harness can drive it + * via the KSUID_TESTING-gated for_testing helpers added in commit 3. + */ #define KSUID_RNG_RESEED_BYTES (1u << 20) /* 1 MiB */ #define KSUID_RNG_RESEED_SECONDS 3600 /* 1 hour */ @@ -59,10 +75,165 @@ typedef struct int64_t seed_time; /* TIME_UTC seconds at last seed */ int64_t seed_pid; /* getpid()/_getpid() at last seed */ bool seeded; + bool destructor_registered; /* thread-exit wipe registered yet? */ } ksuid_tls_rng_t; static _Thread_local ksuid_tls_rng_t ksuid_tls_rng_; +/* Re-entry guard for ksuid_random_thread_state_wipe. A future change + * that adds, e.g., a debug-log call inside the wipe path could call + * back into ksuid_random_bytes; the guarded ksuid_random_bytes path + * returns the RNG-failure sentinel rather than reseeding into a slot + * that is in the middle of being torn down. */ +static _Thread_local bool ksuid_tls_in_destructor_; + +/* Atomic counter incremented on every entry to + * ksuid_random_thread_state_wipe. Always defined and always + * incremented, regardless of KSUID_TESTING -- the cost is one + * relaxed atomic increment per wipe (~5 ns on x86_64) and the + * counter only matters to the test harness, which sees it through + * the KSUID_TESTING-gated extern declaration in rand.h. */ +#include +_Atomic int ksuid_thread_exit_wipes_observed; + +void +ksuid_random_thread_state_wipe (void) +{ + ksuid_tls_in_destructor_ = true; + ksuid_explicit_bzero (&ksuid_tls_rng_, sizeof ksuid_tls_rng_); + ksuid_tls_in_destructor_ = false; + atomic_fetch_add_explicit (&ksuid_thread_exit_wipes_observed, 1, + memory_order_relaxed); +} + +void +ksuid_random_thread_state_set_sentinel_for_testing (void) +{ + /* The test must have already triggered registration on this + * thread (via a real draw). We deliberately preserve the + * destructor_registered flag so the previously-registered hook + * still fires; the seeded flag is also kept true so the next + * draw doesn't overwrite the sentinel through the seed path. */ + bool registered = ksuid_tls_rng_.destructor_registered; + memset (&ksuid_tls_rng_, 0xa5, sizeof ksuid_tls_rng_); + ksuid_tls_rng_.seeded = true; + ksuid_tls_rng_.destructor_registered = registered; +} + +void +ksuid_random_thread_state_peek_for_testing (uint8_t *out, size_t out_len) +{ + size_t n = sizeof ksuid_tls_rng_; + if (out_len < n) + n = out_len; + memcpy (out, &ksuid_tls_rng_, n); +} + +size_t +ksuid_random_thread_state_size_for_testing (void) +{ + return sizeof ksuid_tls_rng_; +} + +/* Per-platform thread-exit registration. Glibc / libc++abi / + * MUSL >= 1.2.0 expose __cxa_thread_atexit_impl, an undocumented + * but stable libc entry point that runs callbacks at thread exit. + * Windows uses FlsAlloc -- a Fiber Local Storage slot whose + * destructor callback fires on thread teardown regardless of + * static-vs-DLL link mode. Both paths are gated behind a meson + * cc.links() probe (commit 2 of the issue #4 series); platforms + * that don't match either branch fall through to the documented + * residue policy in the public header. */ +#if defined(KSUID_HAVE_CXA_THREAD_ATEXIT_IMPL) + +/* Both identifiers below are reserved (double-underscore prefix), but + * they're how glibc / libc++abi spell the symbols we have to call. + * No public header declares them; we forward-declare them here. */ +/* NOLINTNEXTLINE(bugprone-reserved-identifier,cert-dcl37-c,cert-dcl51-cpp) */ +extern int __cxa_thread_atexit_impl (void (*fn) (void *), void *arg, void *dso); +/* NOLINTNEXTLINE(bugprone-reserved-identifier,cert-dcl37-c,cert-dcl51-cpp) */ +extern void *__dso_handle; + +static void +ksuid_tls_atexit_trampoline (void *unused) +{ + (void) unused; + ksuid_random_thread_state_wipe (); +} + +static void +ksuid_tls_register_thread_exit (ksuid_tls_rng_t *r) +{ + if (r->destructor_registered) + return; + r->destructor_registered = true; + /* The third argument scopes the registration to this DSO so a + * dlclose(libksuid.so) tears down its registrations cleanly. + * __dso_handle is `void *`, so we pass its address (a `void **`) + * cast to the `void *` ABI parameter via an explicit cast -- + * silencing bugprone-multi-level-implicit-pointer-conversion. */ + (void) __cxa_thread_atexit_impl (ksuid_tls_atexit_trampoline, NULL, + (void *) &__dso_handle); +} + +#elif defined(KSUID_HAVE_FLS) + +# define WIN32_LEAN_AND_MEAN +# include + +static DWORD ksuid_fls_index_ = FLS_OUT_OF_INDEXES; +static INIT_ONCE ksuid_fls_init_ = INIT_ONCE_STATIC_INIT; + +/* FlsAlloc callback signature is (PVOID) under NTAPI calling + * convention; mismatching it would corrupt the stack on x86_32 MSVC. + * The slot value is just a non-NULL sentinel ("this thread + * participated") -- the actual TLS state still lives in + * _Thread_local storage and is reachable from the same thread + * during teardown, before the runtime tears down its TLS. */ +static VOID NTAPI +ksuid_fls_destroy (PVOID p) +{ + (void) p; + ksuid_random_thread_state_wipe (); +} + +static BOOL CALLBACK +ksuid_fls_init_once (PINIT_ONCE init_once, PVOID parameter, PVOID *context) +{ + (void) init_once; + (void) parameter; + (void) context; + ksuid_fls_index_ = FlsAlloc (ksuid_fls_destroy); + return TRUE; +} + +static void +ksuid_tls_register_thread_exit (ksuid_tls_rng_t *r) +{ + if (r->destructor_registered) + return; + InitOnceExecuteOnce (&ksuid_fls_init_, ksuid_fls_init_once, NULL, NULL); + if (ksuid_fls_index_ == FLS_OUT_OF_INDEXES) + return; + /* FlsSetValue with a non-NULL sentinel marks this thread as + * participating; ksuid_fls_destroy fires on thread exit. */ + if (FlsSetValue (ksuid_fls_index_, (PVOID) (uintptr_t) 1)) + r->destructor_registered = true; +} + +#else /* No thread-exit hook on this platform */ + +static void +ksuid_tls_register_thread_exit (ksuid_tls_rng_t *r) +{ + /* Documented residue path: nothing to register. The bounded + * reseed cadence and ksuid_random_force_reseed are the only + * mitigations. */ + (void) r; +} + +#endif + /* Returns wall-clock seconds (TIME_UTC), or -1 on clock failure. The * sentinel makes the should-reseed predicate fall through to the * "now < seed_time" branch which forces a reseed -- the conservative @@ -116,6 +287,13 @@ ksuid_tls_rng_seed (ksuid_tls_rng_t *r) r->bytes_emitted = 0; r->seed_pid = KSUID_GETPID (); r->seed_time = ksuid_now_seconds (); + /* Register the thread-exit wipe BEFORE flipping the seeded flag + * -- if registration fails partway and the thread later exits we + * must not have a half-wired state where the TLS slot looks + * seeded but the destructor never fires. The registration is + * idempotent via r->destructor_registered, so calling it on + * every reseed is cheap. */ + ksuid_tls_register_thread_exit (r); r->seeded = true; return 0; } @@ -148,6 +326,12 @@ ksuid_random_force_reseed (void) int ksuid_random_bytes (uint8_t *buf, size_t n) { + /* Re-entry from inside ksuid_random_thread_state_wipe is a bug: + * it would reseed into a TLS slot that is being torn down. Bail + * with the RNG-failure sentinel so the caller surfaces the + * problem. */ + if (ksuid_tls_in_destructor_) + return -1; ksuid_tls_rng_t *r = &ksuid_tls_rng_; if (ksuid_tls_rng_should_reseed (r)) { if (ksuid_tls_rng_seed (r) < 0) diff --git a/meson.build b/meson.build index c2eafdc..b1c3c60 100644 --- a/meson.build +++ b/meson.build @@ -86,6 +86,33 @@ elif cc.has_function('memset_s', endif summary({'wipe backend' : wipe_backend}, section : 'libksuid') +# Issue #4 thread-exit wipe registration. Probe via cc.links() with +# an explicit prototype rather than cc.has_function() because +# __cxa_thread_atexit_impl is a private libc symbol that no public +# header declares -- has_function would either spuriously fail +# (no prototype) or spuriously pass (linker found a different +# weak alias). Linking with the explicit prototype is the only +# honest signal that the actual ABI matches what we're going to +# call. +thread_exit_backend = 'documented residue (no automatic wipe)' +cxa_thread_atexit_test = ''' +extern int __cxa_thread_atexit_impl(void (*fn)(void *), void *arg, void *dso); +extern void *__dso_handle; +static void wipe(void *p) { (void)p; } +int main(void) { + return __cxa_thread_atexit_impl(wipe, (void *)0, &__dso_handle); +} +''' +if cc.links(cxa_thread_atexit_test, + name : '__cxa_thread_atexit_impl + __dso_handle') + common_args += '-DKSUID_HAVE_CXA_THREAD_ATEXIT_IMPL=1' + thread_exit_backend = '__cxa_thread_atexit_impl' +elif host_machine.system() == 'windows' + common_args += '-DKSUID_HAVE_FLS=1' + thread_exit_backend = 'FlsAlloc (Windows)' +endif +summary({'thread-exit wipe' : thread_exit_backend}, section : 'libksuid') + # Windows: the supported entropy source is BCryptGenRandom. The whole # /dev/urandom + getrandom + getentropy chain doesn't apply, and # rand_os.c switches to a Bcrypt-only path when KSUID_HAVE_BCRYPT is diff --git a/tests/meson.build b/tests/meson.build index 1e02334..25b361a 100644 --- a/tests/meson.build +++ b/tests/meson.build @@ -23,10 +23,28 @@ foreach t : base_tests endforeach if have_threads_h and threads_dep.found() + # test_rand_tls compiles with -DKSUID_TESTING=1 only when the + # platform exposes a thread-exit hook to assert against (issue #4 + # commit 3). KSUID_TESTING reveals the prototypes of the + # for_testing helpers + the atomic ksuid_thread_exit_wipes_observed + # counter in libksuid/rand.h. The library itself unconditionally + # defines those symbols (cost: one relaxed atomic increment per + # wipe, ~5 ns on x86_64), so the test only needs the prototype to + # see them; no separate testing-enabled library object is + # required. + thread_exit_test_args = [] + has_cxa = cc.get_define('KSUID_HAVE_CXA_THREAD_ATEXIT_IMPL', + args : common_args) == '1' + has_fls = cc.get_define('KSUID_HAVE_FLS', + args : common_args) == '1' + if has_cxa or has_fls + thread_exit_test_args += '-DKSUID_TESTING=1' + endif exe = executable('test_rand_tls', 'test_rand_tls.c', include_directories : test_inc, link_with : test_link, dependencies : threads_dep, + c_args : thread_exit_test_args, ) test('test_rand_tls', exe, timeout : 30) else diff --git a/tests/test_rand_tls.c b/tests/test_rand_tls.c index e3ae8cc..cb22782 100644 --- a/tests/test_rand_tls.c +++ b/tests/test_rand_tls.c @@ -4,6 +4,10 @@ #include #include +#include +#ifdef KSUID_TESTING +# include +#endif static void test_two_calls_produce_distinct_output (void) @@ -86,6 +90,72 @@ test_threads_get_independent_streams (void) } } +#ifdef KSUID_TESTING +/* Issue #4 thread-exit wipe regression test. The thread body sets a + * 0xa5 sentinel pattern in the TLS state, peeks to confirm the + * sentinel is in place, then exits. The platform-registered + * destructor is supposed to fire during thrd_join, calling + * ksuid_random_thread_state_wipe and incrementing the observed + * counter. The main thread asserts the counter ticked. + * + * On platforms that don't have a thread-exit hook (the documented- + * residue lane) the destructor never runs; meson does NOT compile + * test_rand_tls with -DKSUID_TESTING on those lanes. The test is + * skipped at compile time via the #ifdef KSUID_TESTING guard. */ +static int +sentinel_thread_body (void *opaque) +{ + (void) opaque; + /* A real draw triggers the seed path which registers the + * thread-exit destructor on this thread (issue #4 commit 2). + * Without this, set_sentinel runs into a slot whose + * destructor_registered flag is false and the destructor never + * fires on thread exit. */ + uint8_t one[1]; + if (ksuid_random_bytes (one, 1) != 0) + return -1; + /* Now overwrite the live state with the sentinel pattern. The + * registered destructor still fires at thread exit and wipes + * whatever is in the slot. */ + ksuid_random_thread_state_set_sentinel_for_testing (); + /* Sanity check: the sentinel landed in the slot. */ + size_t n = ksuid_random_thread_state_size_for_testing (); + uint8_t *peek = malloc (n); + if (peek == NULL) + return -1; + ksuid_random_thread_state_peek_for_testing (peek, n); + size_t a5_count = 0; + for (size_t i = 0; i < n; ++i) + if (peek[i] == 0xa5) + ++a5_count; + free (peek); + /* At least 64 bytes of state[16] plus 64 bytes of buf must be + * the sentinel (the bool flags read as 1, not 0xa5, but everything + * else does). Use a conservative lower bound. */ + if (a5_count < 128) + return -2; + return 0; +} + +static void +test_thread_exit_wipes_tls_state (void) +{ + int before = atomic_load_explicit (&ksuid_thread_exit_wipes_observed, + memory_order_relaxed); + thrd_t t; + ASSERT_EQ_INT (thrd_create (&t, sentinel_thread_body, NULL), thrd_success); + int rc = -999; + ASSERT_EQ_INT (thrd_join (t, &rc), thrd_success); + ASSERT_EQ_INT (rc, 0); + /* The platform thread-exit hook runs ksuid_random_thread_state_wipe + * inside thrd_join's teardown. The atomic counter must have ticked + * by exactly 1. */ + int after = atomic_load_explicit (&ksuid_thread_exit_wipes_observed, + memory_order_relaxed); + ASSERT_EQ_INT (after - before, 1); +} +#endif + int main (void) { @@ -94,5 +164,8 @@ main (void) RUN_TEST (test_large_buffer_spans_multiple_chacha_blocks); RUN_TEST (test_force_reseed_stays_random); RUN_TEST (test_threads_get_independent_streams); +#ifdef KSUID_TESTING + RUN_TEST (test_thread_exit_wipes_tls_state); +#endif TEST_MAIN_END (); }