Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 35 additions & 1 deletion packages/k8s/src/k8s/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -784,12 +784,46 @@ export async function waitForPodPhases(
await backOffManager.backOff()
}
} catch (error) {
const warningEvents = await getPodWarningEvents(podName)
const eventsSuffix = warningEvents ? `; events: ${warningEvents}` : ''
throw new Error(
`Pod ${podName} is unhealthy with phase status ${phase}: ${formatError(error)}`
`Pod ${podName} is unhealthy with phase status ${phase}: ${formatError(error)}${eventsSuffix}`
)
}
}

// Best-effort: surface the most recent Warning events for a pod
// (FailedScheduling, FailedMount, etc.) that explain why it never reached a
// healthy phase. These live on the Event resource, not the pod object, and the
// ephemeral workflow pod is usually pruned before an operator can
// `kubectl describe` it — so without this the diagnostic is lost.
//
// Returns at most three events, newest first, each rendered as
// `[reason] message` and joined with `; `. Returns '' when there is no usable
// event or when the lookup itself fails.
//
// Never throws: it is diagnostic-only and must not shadow the original failure.
// Reading events needs `events: list`; this is intentionally NOT added to
// requiredPermissions (that would hard-fail prepareJob for existing
// least-privilege deployments), so a 403 here is swallowed and simply yields no
// extra detail.
//
// NOTE(review): the selector matches on object name and event type only, so a
// non-pod object sharing the pod's name in this namespace would also match —
// confirm whether `involvedObject.kind=Pod` should be added.
async function getPodWarningEvents(podName: string): Promise<string> {
  // Upper bound on how many events are appended to the error message.
  const maxEvents = 3
  try {
    const { items } = await k8sApi.listNamespacedEvent({
      namespace: namespace(),
      fieldSelector: `involvedObject.name=${podName},type=Warning`
    })
    return (
      (items ?? [])
        // reason and message are both optional on an Event; one that carries
        // neither adds nothing to the error, so it must not take up a slot.
        .filter(e => Boolean(e.reason || e.message))
        .map(e => {
          const when = new Date(e.lastTimestamp ?? e.eventTime ?? 0).getTime()
          return {
            // Fall back to a placeholder rather than printing "undefined".
            reason: e.reason?.trim() || 'Unknown',
            message: e.message ?? '',
            // A missing timestamp already falls back to the epoch; treat an
            // unparseable one the same way so the comparator never sees NaN.
            when: Number.isNaN(when) ? 0 : when
          }
        })
        .sort((a, b) => b.when - a.when)
        .slice(0, maxEvents)
        // trim() removes the trailing space left behind by an empty message.
        .map(e => `[${e.reason}] ${e.message}`.trim())
        .join('; ')
    )
  } catch (err) {
    core.debug(`Could not list events for pod ${podName}: ${formatError(err)}`)
    return ''
  }
}

export function getPrepareJobTimeoutSeconds(): number {
const envTimeoutSeconds =
process.env['ACTIONS_RUNNER_PREPARE_JOB_TIMEOUT_SECONDS']
Expand Down
171 changes: 171 additions & 0 deletions packages/k8s/tests/pod-events-test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,171 @@
// Spies for the two core-API calls these tests drive. They are declared before
// the module under test is imported because the client stub below hands them
// out from makeApiClient. The `mock` prefix is presumably what lets the hoisted
// jest.mock factory reference them — verify against the Jest transform in use.
const mockReadNamespacedPod = jest.fn()
const mockListNamespacedEvent = jest.fn()

// Swap the Kubernetes client for in-memory stubs so no cluster is contacted.
jest.mock('@kubernetes/client-node', () => {
  // Empty stand-ins for the API classes; only their names are inspected, by
  // makeApiClient below.
  // eslint-disable-next-line @typescript-eslint/no-extraneous-class
  class CoreV1Api {}
  // eslint-disable-next-line @typescript-eslint/no-extraneous-class
  class BatchV1Api {}
  // eslint-disable-next-line @typescript-eslint/no-extraneous-class
  class AuthorizationV1Api {}

  // Shape returned by `new KubeConfig()`.
  const buildKubeConfig = () => ({
    loadFromDefault: jest.fn(),
    // Pick a stub client based on the name of the requested API class; anything
    // that is neither Batch nor Authorization gets the core-API spies.
    makeApiClient: jest.fn().mockImplementation(ApiClass => {
      const apiName = ApiClass?.name || ApiClass?.toString() || ''
      const wantsBatch = apiName.includes('Batch')
      const wantsAuthorization = apiName.includes('Authorization')
      if (wantsBatch) {
        return { readNamespacedJob: jest.fn() }
      }
      if (wantsAuthorization) {
        return { createSelfSubjectAccessReview: jest.fn() }
      }
      return {
        readNamespacedPod: mockReadNamespacedPod,
        listNamespacedEvent: mockListNamespacedEvent
      }
    }),
    getContexts: jest.fn().mockReturnValue([{ namespace: 'test-namespace' }])
  })

  return {
    KubeConfig: jest.fn().mockImplementation(buildKubeConfig),
    Exec: jest.fn().mockImplementation(() => ({ exec: jest.fn() })),
    CoreV1Api,
    BatchV1Api,
    AuthorizationV1Api,
    Log: jest.fn()
  }
})

// Stub tar-fs as an ES module whose default export exposes pack/extract.
// Each returns a fixed fake stream, so no archive work happens in these tests.
jest.mock('tar-fs', () => {
  const packStream = { pipe: jest.fn() }
  const extractStream = { on: jest.fn(), pipe: jest.fn() }
  return {
    __esModule: true,
    default: {
      pack: jest.fn().mockReturnValue(packStream),
      extract: jest.fn().mockReturnValue(extractStream)
    }
  }
})

import { waitForPodPhases } from '../src/k8s'
import { PodPhase } from '../src/k8s/utils'

// Phase sets mirroring the real prepare-job call: wait for RUNNING, keep
// backing off while PENDING. A pod reported in any other phase goes straight
// to the unhealthy-throw path — the catch block that enriches the error with
// Warning events. Each call builds a fresh Set.
function awaitingPhases(): Set<PodPhase> {
  return new Set<PodPhase>([PodPhase.RUNNING])
}

function backOffPhases(): Set<PodPhase> {
  return new Set<PodPhase>([PodPhase.PENDING])
}

describe('waitForPodPhases Warning event enrichment', () => {
  // Starts the wait on 'test-pod' with the prepare-job phase sets.
  const waitForTestPod = (): ReturnType<typeof waitForPodPhases> =>
    waitForPodPhases('test-pod', awaitingPhases(), backOffPhases())

  // Resolves to the message of the error the wait rejects with, or '' when the
  // wait unexpectedly succeeds (which the positive assertions then catch).
  const captureFailureMessage = async (): Promise<string> => {
    try {
      await waitForTestPod()
    } catch (error) {
      return (error as Error).message
    }
    return ''
  }

  // Makes every pod read report the Failed phase, which is neither awaited nor
  // backed off on. A fresh object is handed to the mock on each call.
  const stubFailedPod = (): void => {
    mockReadNamespacedPod.mockResolvedValue({ status: { phase: 'Failed' } })
  }

  // Makes the event listing resolve with the given items.
  const stubEvents = (items: object[]): void => {
    mockListNamespacedEvent.mockResolvedValue({ items })
  }

  beforeEach(() => {
    jest.clearAllMocks()
    process.env['ACTIONS_RUNNER_KUBERNETES_NAMESPACE'] = 'test-namespace'
  })

  afterEach(() => {
    delete process.env['ACTIONS_RUNNER_KUBERNETES_NAMESPACE']
  })

  it('appends recent Warning events to the thrown error', async () => {
    stubFailedPod()
    stubEvents([
      {
        reason: 'FailedScheduling',
        message: '0/3 nodes are available: 3 Too many pods.',
        lastTimestamp: new Date('2026-06-05T10:00:00Z'),
        type: 'Warning'
      }
    ])

    await expect(waitForTestPod()).rejects.toThrow(
      'events: [FailedScheduling] 0/3 nodes are available: 3 Too many pods.'
    )
  })

  it('queries Warning events scoped to the pod in the right namespace', async () => {
    stubFailedPod()
    stubEvents([])

    await expect(waitForTestPod()).rejects.toThrow(
      /unhealthy with phase status Failed/
    )

    const expectedQuery = {
      namespace: 'test-namespace',
      fieldSelector: 'involvedObject.name=test-pod,type=Warning'
    }
    expect(mockListNamespacedEvent).toHaveBeenCalledWith(expectedQuery)
  })

  it('keeps only the 3 most recent events, newest first', async () => {
    const oldest = {
      reason: 'Oldest',
      message: 'm0',
      lastTimestamp: new Date('2026-06-05T10:00:00Z')
    }
    const newest = {
      reason: 'Newest',
      message: 'm3',
      lastTimestamp: new Date('2026-06-05T10:03:00Z')
    }
    // exercise the eventTime fallback when lastTimestamp is absent
    const middle = {
      reason: 'Middle',
      message: 'm1',
      eventTime: new Date('2026-06-05T10:01:00Z')
    }
    const later = {
      reason: 'Later',
      message: 'm2',
      lastTimestamp: new Date('2026-06-05T10:02:00Z')
    }
    stubFailedPod()
    // Deliberately unsorted so the ordering has to come from the code under test.
    stubEvents([oldest, newest, middle, later])

    const message = await captureFailureMessage()

    expect(message).toContain('events: [Newest] m3; [Later] m2; [Middle] m1')
    expect(message).not.toContain('Oldest')
  })

  it('does not append an events section when there are none', async () => {
    stubFailedPod()
    stubEvents([])

    const message = await captureFailureMessage()

    expect(message).toContain('unhealthy with phase status Failed')
    expect(message).not.toContain('events:')
  })

  it('is best-effort: a failed event lookup never shadows the original error', async () => {
    // Both the pod read and the event listing fail; only the former may surface.
    mockReadNamespacedPod.mockRejectedValue(new Error('network timeout'))
    mockListNamespacedEvent.mockRejectedValue(
      new Error('events is forbidden: User cannot list events')
    )

    const message = await captureFailureMessage()

    expect(message).toContain(
      'Pod test-pod is unhealthy with phase status Unknown: network timeout'
    )
    expect(message).not.toContain('events:')
    expect(message).not.toContain('forbidden')
  })
})