From 92525c76ab02f3c168c606a5b6e78733e3dc9d0a Mon Sep 17 00:00:00 2001
From: Remi DEBETTE
Date: Fri, 5 Jun 2026 13:31:10 +0200
Subject: [PATCH] feat(k8s): surface pod Warning events on waitForPodPhases timeout

When a workflow pod never reaches Running for a reason that lives on the
K8s Event resource (FailedScheduling "Too many pods" / "Insufficient cpu"
/ untolerated taints, FailedMount, ...) rather than on the pod object,
the hook previously surfaced only a generic timeout. The ephemeral pod is
usually pruned before an operator can `kubectl describe` it, so the
diagnostic was lost.

waitForPodPhases now makes a best-effort fetch of the pod's most recent
Warning events in its catch path and appends up to the 3 newest to the
thrown error, e.g.:

    Pod foo is unhealthy with phase status Pending: backoff timeout;
    events: [FailedScheduling] 0/3 nodes are available: 3 Too many pods.

The fetch never throws: it must not shadow the original failure, and
reading events needs `events: list` which is intentionally NOT added to
requiredPermissions (doing so would hard-fail prepareJob for existing
least-privilege deployments). A 403 is swallowed and simply yields no
extra detail.

Third piece of the k8s error-surfacing work after #341 and #364;
complements #336 (container waiting reasons, a different resource).

Implements #366

Co-Authored-By: Claude Opus 4.8 (1M context)
---
 packages/k8s/src/k8s/index.ts         |  36 +++++-
 packages/k8s/tests/pod-events-test.ts | 171 ++++++++++++++++++++++++++
 2 files changed, 206 insertions(+), 1 deletion(-)
 create mode 100644 packages/k8s/tests/pod-events-test.ts

diff --git a/packages/k8s/src/k8s/index.ts b/packages/k8s/src/k8s/index.ts
index beaee808..ebe6f0b5 100644
--- a/packages/k8s/src/k8s/index.ts
+++ b/packages/k8s/src/k8s/index.ts
@@ -784,12 +784,46 @@ export async function waitForPodPhases(
       await backOffManager.backOff()
     }
   } catch (error) {
+    const warningEvents = await getPodWarningEvents(podName)
+    const eventsSuffix = warningEvents ? `; events: ${warningEvents}` : ''
     throw new Error(
-      `Pod ${podName} is unhealthy with phase status ${phase}: ${formatError(error)}`
+      `Pod ${podName} is unhealthy with phase status ${phase}: ${formatError(error)}${eventsSuffix}`
     )
   }
 }
 
+// Best-effort: surface the most recent Warning events for a pod
+// (FailedScheduling, FailedMount, etc.) that explain why it never reached a
+// healthy phase. These live on the Event resource, not the pod object, and the
+// ephemeral workflow pod is usually pruned before an operator can
+// `kubectl describe` it — so without this the diagnostic is lost. Never throws:
+// it is diagnostic-only and must not shadow the original failure. Reading
+// events needs `events: list`; this is intentionally NOT added to
+// requiredPermissions (that would hard-fail prepareJob for existing
+// least-privilege deployments), so a 403 here is swallowed and simply yields no
+// extra detail.
+async function getPodWarningEvents(podName: string): Promise<string> {
+  try {
+    const { items } = await k8sApi.listNamespacedEvent({
+      namespace: namespace(),
+      fieldSelector: `involvedObject.name=${podName},type=Warning`
+    })
+    return (items ?? [])
+      .map(e => ({
+        reason: e.reason,
+        message: e.message,
+        when: new Date(e.lastTimestamp ?? e.eventTime ?? 0).getTime()
+      }))
+      .sort((a, b) => b.when - a.when)
+      .slice(0, 3)
+      .map(e => `[${e.reason}] ${e.message}`)
+      .join('; ')
+  } catch (err) {
+    core.debug(`Could not list events for pod ${podName}: ${formatError(err)}`)
+    return ''
+  }
+}
+
 export function getPrepareJobTimeoutSeconds(): number {
   const envTimeoutSeconds =
     process.env['ACTIONS_RUNNER_PREPARE_JOB_TIMEOUT_SECONDS']
diff --git a/packages/k8s/tests/pod-events-test.ts b/packages/k8s/tests/pod-events-test.ts
new file mode 100644
index 00000000..164acc57
--- /dev/null
+++ b/packages/k8s/tests/pod-events-test.ts
@@ -0,0 +1,171 @@
+const mockReadNamespacedPod = jest.fn()
+const mockListNamespacedEvent = jest.fn()
+
+jest.mock('@kubernetes/client-node', () => {
+  return {
+    KubeConfig: jest.fn().mockImplementation(() => ({
+      loadFromDefault: jest.fn(),
+      makeApiClient: jest.fn().mockImplementation(ApiClass => {
+        const name = ApiClass?.name || ApiClass?.toString() || ''
+        if (name.includes('Batch')) {
+          return { readNamespacedJob: jest.fn() }
+        }
+        if (name.includes('Authorization')) {
+          return { createSelfSubjectAccessReview: jest.fn() }
+        }
+        return {
+          readNamespacedPod: mockReadNamespacedPod,
+          listNamespacedEvent: mockListNamespacedEvent
+        }
+      }),
+      getContexts: jest.fn().mockReturnValue([{ namespace: 'test-namespace' }])
+    })),
+    Exec: jest.fn().mockImplementation(() => ({ exec: jest.fn() })),
+    // eslint-disable-next-line @typescript-eslint/no-extraneous-class
+    CoreV1Api: class CoreV1Api {},
+    // eslint-disable-next-line @typescript-eslint/no-extraneous-class
+    BatchV1Api: class BatchV1Api {},
+    // eslint-disable-next-line @typescript-eslint/no-extraneous-class
+    AuthorizationV1Api: class AuthorizationV1Api {},
+    Log: jest.fn()
+  }
+})
+
+jest.mock('tar-fs', () => ({
+  default: {
+    pack: jest.fn().mockReturnValue({ pipe: jest.fn() }),
+    extract: jest.fn().mockReturnValue({
+      on: jest.fn(),
+      pipe: jest.fn()
+    })
+  },
+  __esModule: true
+}))
+
+import { waitForPodPhases } from '../src/k8s'
+import { PodPhase } from '../src/k8s/utils'
+
+// awaiting RUNNING / backing-off PENDING mirrors the real prepare-job call.
+// A pod reported in any other phase falls straight into the unhealthy-throw
+// path, which is the catch block that enriches the error with Warning events.
+const awaitingPhases = (): Set<PodPhase> => new Set([PodPhase.RUNNING])
+const backOffPhases = (): Set<PodPhase> => new Set([PodPhase.PENDING])
+
+describe('waitForPodPhases Warning event enrichment', () => {
+  beforeEach(() => {
+    jest.clearAllMocks()
+    process.env['ACTIONS_RUNNER_KUBERNETES_NAMESPACE'] = 'test-namespace'
+  })
+
+  afterEach(() => {
+    delete process.env['ACTIONS_RUNNER_KUBERNETES_NAMESPACE']
+  })
+
+  it('appends recent Warning events to the thrown error', async () => {
+    mockReadNamespacedPod.mockResolvedValue({ status: { phase: 'Failed' } })
+    mockListNamespacedEvent.mockResolvedValue({
+      items: [
+        {
+          reason: 'FailedScheduling',
+          message: '0/3 nodes are available: 3 Too many pods.',
+          lastTimestamp: new Date('2026-06-05T10:00:00Z'),
+          type: 'Warning'
+        }
+      ]
+    })
+
+    await expect(
+      waitForPodPhases('test-pod', awaitingPhases(), backOffPhases())
+    ).rejects.toThrow(
+      'events: [FailedScheduling] 0/3 nodes are available: 3 Too many pods.'
+    )
+  })
+
+  it('queries Warning events scoped to the pod in the right namespace', async () => {
+    mockReadNamespacedPod.mockResolvedValue({ status: { phase: 'Failed' } })
+    mockListNamespacedEvent.mockResolvedValue({ items: [] })
+
+    await expect(
+      waitForPodPhases('test-pod', awaitingPhases(), backOffPhases())
+    ).rejects.toThrow(/unhealthy with phase status Failed/)
+
+    expect(mockListNamespacedEvent).toHaveBeenCalledWith({
+      namespace: 'test-namespace',
+      fieldSelector: 'involvedObject.name=test-pod,type=Warning'
+    })
+  })
+
+  it('keeps only the 3 most recent events, newest first', async () => {
+    mockReadNamespacedPod.mockResolvedValue({ status: { phase: 'Failed' } })
+    mockListNamespacedEvent.mockResolvedValue({
+      items: [
+        {
+          reason: 'Oldest',
+          message: 'm0',
+          lastTimestamp: new Date('2026-06-05T10:00:00Z')
+        },
+        {
+          reason: 'Newest',
+          message: 'm3',
+          lastTimestamp: new Date('2026-06-05T10:03:00Z')
+        },
+        {
+          reason: 'Middle',
+          message: 'm1',
+          // exercise the eventTime fallback when lastTimestamp is absent
+          eventTime: new Date('2026-06-05T10:01:00Z')
+        },
+        {
+          reason: 'Later',
+          message: 'm2',
+          lastTimestamp: new Date('2026-06-05T10:02:00Z')
+        }
+      ]
+    })
+
+    let message = ''
+    try {
+      await waitForPodPhases('test-pod', awaitingPhases(), backOffPhases())
+    } catch (error) {
+      message = (error as Error).message
+    }
+
+    expect(message).toContain('events: [Newest] m3; [Later] m2; [Middle] m1')
+    expect(message).not.toContain('Oldest')
+  })
+
+  it('does not append an events section when there are none', async () => {
+    mockReadNamespacedPod.mockResolvedValue({ status: { phase: 'Failed' } })
+    mockListNamespacedEvent.mockResolvedValue({ items: [] })
+
+    let message = ''
+    try {
+      await waitForPodPhases('test-pod', awaitingPhases(), backOffPhases())
+    } catch (error) {
+      message = (error as Error).message
+    }
+
+    expect(message).toContain('unhealthy with phase status Failed')
+    expect(message).not.toContain('events:')
+  })
+
+  it('is best-effort: a failed event lookup never shadows the original error', async () => {
+    mockReadNamespacedPod.mockRejectedValue(new Error('network timeout'))
+    mockListNamespacedEvent.mockRejectedValue(
+      new Error('events is forbidden: User cannot list events')
+    )
+
+    let message = ''
+    try {
+      await waitForPodPhases('test-pod', awaitingPhases(), backOffPhases())
+    } catch (error) {
+      message = (error as Error).message
+    }
+
+    expect(message).toContain(
+      'Pod test-pod is unhealthy with phase status Unknown: network timeout'
+    )
+    expect(message).not.toContain('events:')
+    expect(message).not.toContain('forbidden')
+  })
+})