diff --git a/.github/workflows/ci-pr.yml b/.github/workflows/ci-pr.yml index 5cf5255..b7d4524 100644 --- a/.github/workflows/ci-pr.yml +++ b/.github/workflows/ci-pr.yml @@ -105,6 +105,39 @@ jobs: call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvars64.bat" meson test -C builddir --print-errorlogs + - name: Verify ksuid_explicit_bzero survives optimization + if: runner.os == 'Linux' && matrix.compiler == 'gcc' + run: | + # Issue #2: the DSE-resistant wipe in libksuid/wipe.h must + # actually emit a call (or equivalent indirect call) at + # every site in rand_tls.c and chacha20.c. A plain memset + # would be elided by -O2; explicit_bzero / SecureZeroMemory / + # the volatile-fn-ptr fallback must NOT be. This step proves + # the shim's calls survive into the optimised library by + # grepping the disassembly for either form: a direct call + # to explicit_bzero@plt (where the static-inline shim got + # inlined into its caller) or a call to the out-of-line + # ksuid_explicit_bzero symbol (where the compiler kept + # the shim as a function). At least four wipe sites exist + # in source: three in rand_tls.c plus one in chacha20.c, so + # counting >=4 surviving calls is the correctness floor. + # Path uses find -type f to skip symlinks AND the + # libksuid.so..p object-archive directory that meson + # leaves alongside the real .so. + set -eux + shared=$(find builddir -maxdepth 1 -type f -name 'libksuid.so.*' \ + | grep -E 'libksuid\.so\.[0-9]+\.[0-9]+\.[0-9]+$' \ + | head -1) + test -n "$shared" || { echo "::error::no versioned libksuid.so found in builddir" >&2; exit 1; } + n=$(objdump -d "$shared" \ + | grep -cE 'call .*<(explicit_bzero|ksuid_explicit_bzero)' \ + || true) + echo "wipe call sites in optimised .so: $n" + if [ "$n" -lt 4 ]; then + echo "::error::Expected at least 4 surviving wipe calls in libksuid.so.*; found $n. DSE may have eaten the wipes." >&2 + exit 1 + fi + - name: Verify install lays down license artifacts if: runner.os == 'Linux' run: | @@ -124,7 +157,61 @@ jobs: find staging -name 'libksuid.so' -print | grep -q . # ========================================================================== - # Phase 2b: meson dist round-trip + # Phase 2b: Wipe-shim fallback path coverage (issue #2) + # The default build on every supported matrix lane resolves + # ksuid_explicit_bzero to a platform primitive (explicit_bzero, + # SecureZeroMemory, or memset_s). The portable volatile-fn-ptr + # fallback in libksuid/wipe.h therefore ships *unexercised* unless + # CI explicitly forces it. This job builds with + # KSUID_FORCE_VOLATILE_FALLBACK=1, which disables every primitive + # branch and exercises the fallback. test_wipe asserts the + # fallback still zeroes the buffers it is given (it does not + # assert DSE-resistance -- the auto-build disasm gate above is + # the DSE-resistance witness; this job witnesses fallback + # correctness on a host that would otherwise never run the path). + # ========================================================================== + wipe-fallback: + name: Wipe shim fallback path coverage + needs: [lint] + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v5 + + - name: Install dependencies + run: | + sudo apt-get update + sudo apt-get install -y meson ninja-build + + - name: Configure with KSUID_FORCE_VOLATILE_FALLBACK + run: | + meson setup builddir-fb \ + -Dc_args=-DKSUID_FORCE_VOLATILE_FALLBACK=1 + + - name: Build + run: meson compile -C builddir-fb + + - name: Run test_wipe under fallback shim + run: meson test -C builddir-fb test_wipe --print-errorlogs + + - name: Confirm fallback path actually skipped explicit_bzero + run: | + # On the fallback path the shim must NOT emit a call to + # explicit_bzero@plt anywhere in the optimised library; + # grepping for any such reference catches a regression + # where a future contributor adds a primitive without + # gating on KSUID_FORCE_VOLATILE_FALLBACK. + set -eux + shared=$(find builddir-fb -maxdepth 1 -type f \ + -name 'libksuid.so.*' \ + | grep -E 'libksuid\.so\.[0-9]+\.[0-9]+\.[0-9]+$' \ + | head -1) + if objdump -d "$shared" | grep -qE 'call .*'; then + echo "::error::Fallback build leaked an explicit_bzero@plt call; KSUID_FORCE_VOLATILE_FALLBACK is not gating the primitive." >&2 + exit 1 + fi + + # ========================================================================== + # Phase 2c: meson dist round-trip # Builds a source tarball, extracts it into a clean tree, and rebuilds # the library + tests from the tarball. Catches reproducibility # regressions in configure_file templates (R2 from issue #3): if a diff --git a/libksuid/chacha20.c b/libksuid/chacha20.c index c40518e..0593638 100644 --- a/libksuid/chacha20.c +++ b/libksuid/chacha20.c @@ -12,6 +12,8 @@ #include +#include + #define ROTL32(x, n) (((x) << (n)) | ((x) >> (32 - (n)))) #define QR(a, b, c, d) do { \ @@ -55,4 +57,11 @@ ksuid_chacha20_block (uint8_t out[64], uint32_t state[16]) state[12]++; if (state[12] == 0) state[13]++; + + /* Wipe the local x[16] before returning. After the round-mixing it + * holds the keystream-mixed words, which a sibling stack frame + * with a stack-read primitive could lift. Defense in depth: cost + * is ~64 bytes wiped per 64-byte block, dominated by the chacha + * computation itself. */ + ksuid_explicit_bzero (x, sizeof x); } diff --git a/libksuid/rand_tls.c b/libksuid/rand_tls.c index 1320e09..d117591 100644 --- a/libksuid/rand_tls.c +++ b/libksuid/rand_tls.c @@ -38,6 +38,14 @@ #endif #include +#include + +/* TODO(#4): the per-thread ksuid_tls_rng_t below lives until the OS + * reclaims its TLS block, which means a thread that exits leaves the + * 64-byte ChaCha state and the 64-byte keystream buffer in process + * memory until then. Issue #4 covers wiping that state on thread + * exit. This file's wipes are bounded to the short-lived locals -- + * the 44-byte seed buffer and the in-flight keystream chunks. */ #define KSUID_RNG_RESEED_BYTES (1u << 20) /* 1 MiB */ #define KSUID_RNG_RESEED_SECONDS 3600 /* 1 hour */ @@ -75,7 +83,7 @@ ksuid_tls_rng_seed (ksuid_tls_rng_t *r) uint8_t kn[44]; /* 32 key + 12 nonce */ if (ksuid_os_random_bytes (kn, sizeof kn) < 0) { /* Wipe partial seed bytes before returning. */ - memset (kn, 0, sizeof kn); + ksuid_explicit_bzero (kn, sizeof kn); return -1; } r->state[0] = KSUID_CHACHA20_C0; @@ -96,9 +104,13 @@ ksuid_tls_rng_seed (ksuid_tls_rng_t *r) | ((uint32_t) kn[32 + i * 4 + 3] << 24); } /* Wipe seed material from local. The chacha state itself stays in - * TLS, but at least we limit the residue. */ - memset (kn, 0, sizeof kn); + * TLS, but at least we limit the residue from leaving on the + * stack frame after this function returns. */ + ksuid_explicit_bzero (kn, sizeof kn); + /* r->buf is about to be overwritten by the first chacha block; the + * memset is initialisation, not secret-erasure, so plain memset is + * fine here. */ memset (r->buf, 0, sizeof r->buf); r->buf_pos = sizeof r->buf; /* empty buffer, force first block */ r->bytes_emitted = 0; @@ -150,8 +162,10 @@ ksuid_random_bytes (uint8_t *buf, size_t n) size_t chunk = (n < avail) ? n : avail; memcpy (buf, r->buf + r->buf_pos, chunk); /* Wipe consumed keystream to limit forward exposure if memory is - * later inspected. */ - memset (r->buf + r->buf_pos, 0, chunk); + * later inspected. ksuid_explicit_bzero blocks DSE here -- a + * plain memset would be elided by -O2 because the wiped bytes + * are not subsequently read. */ + ksuid_explicit_bzero (r->buf + r->buf_pos, chunk); r->buf_pos += chunk; buf += chunk; n -= chunk; diff --git a/libksuid/wipe.h b/libksuid/wipe.h new file mode 100644 index 0000000..f20f16d --- /dev/null +++ b/libksuid/wipe.h @@ -0,0 +1,104 @@ +/* SPDX-License-Identifier: LGPL-3.0-or-later + * + * DSE-resistant zeroizer. Plain memset(p, 0, n) on a buffer that the + * compiler proves is "dead" (never read again) is allowed -- and at + * -O2 and beyond, encouraged -- to be elided entirely. For sensitive + * material (CSPRNG seed, ChaCha20 internal state, freshly-drawn key + * bytes) that is exactly the wrong outcome. + * + * ksuid_explicit_bzero picks the strongest DSE-immune primitive the + * compile target offers, in this order: + * + * 1. explicit_bzero (glibc 2.25+, MUSL, *BSD, macOS 14.4+) + * Documented to resist optimisation; canonical answer. + * Header lives in on Linux/FreeBSD/NetBSD; + * on OpenBSD/macOS. Two meson probes pick. + * + * 2. SecureZeroMemory (Windows, ) + * MSDN explicitly guarantees the writes are not optimised + * away. Macro over RtlSecureZeroMemory. + * + * 3. memset_s (C11 Annex K) + * Rare; gated behind __STDC_LIB_EXT1__. Required to be + * DSE-immune by the standard. + * + * 4. Portable fallback: indirect call through a `volatile` + * function pointer to memset, followed by a memory-clobber + * barrier. The volatile qualifier on the pointer forces the + * compiler to actually re-read it and emit the call; the + * barrier prevents post-call dead-code analysis from proving + * the writes are unobserved. + * + * The fallback is correct on every C11 toolchain we target but + * empirically a few rungs slower than explicit_bzero / Secure- + * ZeroMemory because it goes through an indirect call. CI on the + * Linux GCC lane runs an objdump grep that fails the build if the + * compiler elided the wipe. + * + * Issue #2 scope: short-lived locals (44-byte seed buffer in + * rand_tls.c, 64-byte keystream chunks, 16-word ChaCha state). + * Long-lived TLS state at thread-exit is issue #4. + */ +#ifndef KSUID_WIPE_H +#define KSUID_WIPE_H + +#include +#include + +/* The KSUID_FORCE_VOLATILE_FALLBACK build flag bypasses every + * platform-specific primitive and forces the indirect-call-through- + * volatile path. It exists so CI can exercise the fallback even on + * a host that has explicit_bzero / SecureZeroMemory available -- + * without it the fallback ships unverified on every supported + * matrix lane. Production builds never set this. */ +#if !defined(KSUID_FORCE_VOLATILE_FALLBACK) +# if defined(KSUID_HAVE_EXPLICIT_BZERO_STRINGS_H) +# include +# elif defined(KSUID_HAVE_EXPLICIT_BZERO_STRING_H) +/* explicit_bzero already pulled in by above. */ +# elif defined(_WIN32) || defined(__CYGWIN__) +# define WIN32_LEAN_AND_MEAN +# include +# elif defined(KSUID_HAVE_MEMSET_S) +/* __STDC_WANT_LIB_EXT1__ is set project-wide by meson when this + * branch is selected -- defining it here would be too late, the + * unconditional include at the top of this header has + * already burned the prototype set without it. */ +# endif +#endif + +static inline void +ksuid_explicit_bzero (void *p, size_t n) +{ + if (p == NULL || n == 0) + return; + +#if !defined(KSUID_FORCE_VOLATILE_FALLBACK) && \ + (defined(KSUID_HAVE_EXPLICIT_BZERO_STRINGS_H) \ + || defined(KSUID_HAVE_EXPLICIT_BZERO_STRING_H)) + explicit_bzero (p, n); +#elif !defined(KSUID_FORCE_VOLATILE_FALLBACK) && \ + (defined(_WIN32) || defined(__CYGWIN__)) + SecureZeroMemory (p, n); +#elif !defined(KSUID_FORCE_VOLATILE_FALLBACK) \ + && defined(KSUID_HAVE_MEMSET_S) + memset_s (p, n, 0, n); +#else + /* Indirect-call-through-volatile fallback. The function pointer + * is volatile-qualified, so the compiler must re-read it and + * emit a real call -- it cannot inline memset and elide the + * stores via DSE. The trailing memory clobber on GCC/Clang + * blocks any post-call dead-code reasoning. + * + * On MSVC 's _ReadWriteBarrier serves the same role, + * but MSVC builds use the SecureZeroMemory branch above so this + * fallback is GCC/Clang in practice. */ + static void *(*const volatile ksuid_memset_v) (void *, int, size_t) = memset; + ksuid_memset_v (p, 0, n); +# if defined(__GNUC__) || defined(__clang__) + __asm__ __volatile__ (""::"r" (p):"memory"); +# endif +#endif +} + +#endif /* KSUID_WIPE_H */ diff --git a/meson.build b/meson.build index 9d94aaf..c2eafdc 100644 --- a/meson.build +++ b/meson.build @@ -44,6 +44,48 @@ if cc.has_function('getentropy', prefix : '#include ') common_args += '-DKSUID_HAVE_GETENTROPY=1' endif +# DSE-resistant wipe primitive (issue #2). The explicit_bzero +# prototype lives in different headers on different platforms: +# modern glibc (2.25+), macOS 14.4+, and OpenBSD declare it in +# ; FreeBSD, NetBSD, and some older glibc / MUSL builds +# put it in . Probe first (the common case) +# and fall back to . +# +# Pass _DEFAULT_SOURCE explicitly because meson does not inherit +# add_project_arguments() into has_function() probes, and glibc +# gates the prototype behind that feature macro. +# +# memset_s is the C11 Annex K third option; rarely shipped. +# Windows uses SecureZeroMemory unconditionally (the macro lives +# in and cc.has_function lies on macros, so detect by +# host system instead). +wipe_backend = 'volatile-fallback' +if cc.has_function('explicit_bzero', + prefix : '#include ', + args : ['-D_DEFAULT_SOURCE']) + common_args += '-DKSUID_HAVE_EXPLICIT_BZERO_STRING_H=1' + wipe_backend = 'explicit_bzero ()' +elif cc.has_function('explicit_bzero', + prefix : '#include ', + args : ['-D_DEFAULT_SOURCE']) + common_args += '-DKSUID_HAVE_EXPLICIT_BZERO_STRINGS_H=1' + wipe_backend = 'explicit_bzero ()' +elif host_machine.system() == 'windows' + wipe_backend = 'SecureZeroMemory' +elif cc.has_function('memset_s', + prefix : '#define __STDC_WANT_LIB_EXT1__ 1\n#include ') + common_args += '-DKSUID_HAVE_MEMSET_S=1' + # Annex K's memset_s prototype is gated behind __STDC_WANT_LIB_EXT1__ + # on every libc that provides it (glibc, Apple libc, MUSL...). + # Defining the opt-in macro project-wide -- not just inside wipe.h + # -- guarantees it is set before is first included by + # ANY translation unit, since wipe.h cannot rewind a prior include + # that already pulled in without the opt-in. + common_args += '-D__STDC_WANT_LIB_EXT1__=1' + wipe_backend = 'memset_s (C11 Annex K)' +endif +summary({'wipe backend' : wipe_backend}, section : 'libksuid') + # Windows: the supported entropy source is BCryptGenRandom. The whole # /dev/urandom + getrandom + getentropy chain doesn't apply, and # rand_os.c switches to a Bcrypt-only path when KSUID_HAVE_BCRYPT is diff --git a/tests/meson.build b/tests/meson.build index b294d70..1e02334 100644 --- a/tests/meson.build +++ b/tests/meson.build @@ -11,7 +11,8 @@ threads_dep = dependency('threads', required : false) base_tests = ['test_smoke', 'test_parts', 'test_base62', 'test_parse_format', 'test_sequence', 'test_rand_os', - 'test_chacha20', 'test_new', 'test_simd_parity'] + 'test_chacha20', 'test_new', 'test_simd_parity', + 'test_wipe'] foreach t : base_tests exe = executable(t, t + '.c', diff --git a/tests/test_wipe.c b/tests/test_wipe.c new file mode 100644 index 0000000..0760369 --- /dev/null +++ b/tests/test_wipe.c @@ -0,0 +1,61 @@ +/* SPDX-License-Identifier: LGPL-3.0-or-later + * + * Smoke test for libksuid/wipe.h. Proves the shim *zeroes* a buffer. + * Does NOT prove the shim resists dead-store elimination -- that + * property is verified by the objdump grep step in CI's Linux GCC + * lane, which fails the build if the wipe call disappeared from the + * library's optimised disassembly. + */ +#include +#include "test_util.h" + +static void +test_wipe_zeroes_a_full_buffer (void) +{ + uint8_t buf[64]; + for (size_t i = 0; i < sizeof buf; ++i) + buf[i] = (uint8_t) (i ^ 0xa5); + ksuid_explicit_bzero (buf, sizeof buf); + for (size_t i = 0; i < sizeof buf; ++i) + ASSERT_EQ_INT (buf[i], 0); +} + +static void +test_wipe_zeroes_a_subrange (void) +{ + uint8_t buf[16]; + memset (buf, 0xff, sizeof buf); + ksuid_explicit_bzero (buf + 4, 8); + ASSERT_EQ_INT (buf[3], 0xff); + for (size_t i = 4; i < 12; ++i) + ASSERT_EQ_INT (buf[i], 0); + ASSERT_EQ_INT (buf[12], 0xff); +} + +static void +test_wipe_handles_zero_length (void) +{ + uint8_t buf[4] = { 1, 2, 3, 4 }; + ksuid_explicit_bzero (buf, 0); + ASSERT_EQ_INT (buf[0], 1); + ASSERT_EQ_INT (buf[3], 4); +} + +static void +test_wipe_handles_null_pointer (void) +{ + /* Documented contract: the shim is a no-op on (NULL, n) and on + * (p, 0). Neither must crash. */ + ksuid_explicit_bzero (NULL, 0); + ksuid_explicit_bzero (NULL, 64); +} + +int +main (void) +{ + RUN_TEST (test_wipe_zeroes_a_full_buffer); + RUN_TEST (test_wipe_zeroes_a_subrange); + RUN_TEST (test_wipe_handles_zero_length); + RUN_TEST (test_wipe_handles_null_pointer); + TEST_MAIN_END (); +}