fix: health check skips session liveness checks when gateway unavailable (#165)

This commit is contained in:
Lauren ten Hoor
2026-02-13 19:40:43 +08:00
parent 19a2cb3bfc
commit 8b5cfedb01

View File

@@ -79,9 +79,10 @@ export type SessionLookup = Map<string, GatewaySession>;
/**
* Query gateway status and build a lookup map of active sessions.
* Caches result for the duration of a health check pass.
* Returns null if the gateway is unavailable (timeout, error, etc.).
* Callers should skip session liveness checks if null — unknown ≠ dead.
*/
export async function fetchGatewaySessions(): Promise<SessionLookup> {
export async function fetchGatewaySessions(): Promise<SessionLookup | null> {
const lookup: SessionLookup = new Map();
try {
@@ -98,20 +99,21 @@ export async function fetchGatewaySessions(): Promise<SessionLookup> {
lookup.set(session.key, session);
}
}
} catch {
// Gateway unavailable — return empty map (all sessions will be treated as missing)
}
return lookup;
} catch {
// Gateway unavailable — return null (don't assume sessions are dead)
return null;
}
}
/**
* Check if a session key exists in the gateway and is considered "alive".
* A session is alive if it exists. We don't consider percentUsed or abortedLastRun
* as dead indicators — those are normal states for reusable sessions.
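* Returns false if the sessions lookup is null (gateway unavailable). In that case
* false means "unknown", not "dead" — callers must check for a null lookup before
* treating a false result as a dead session.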
*/
function isSessionAlive(sessionKey: string, sessions: SessionLookup): boolean {
return sessions.has(sessionKey);
function isSessionAlive(sessionKey: string, sessions: SessionLookup | null): boolean {
return sessions ? sessions.has(sessionKey) : false;
}
// ---------------------------------------------------------------------------
@@ -144,7 +146,7 @@ export async function checkWorkerHealth(opts: {
role: Role;
autoFix: boolean;
provider: IssueProvider;
sessions: SessionLookup;
sessions: SessionLookup | null;
/** Workflow config (defaults to DEFAULT_WORKFLOW) */
workflow?: WorkflowConfig;
}): Promise<HealthFix[]> {
@@ -252,8 +254,9 @@ export async function checkWorkerHealth(opts: {
// ---------------------------------------------------------------------------
// Case 1: Active with correct label but session is dead/missing
// Skip if sessions lookup unavailable (gateway timeout) — unknown ≠ dead
// ---------------------------------------------------------------------------
if (worker.active && sessionKey && !isSessionAlive(sessionKey, sessions)) {
if (worker.active && sessionKey && sessions && !isSessionAlive(sessionKey, sessions)) {
const fix: HealthFix = {
issue: {
type: "session_dead",
@@ -307,8 +310,9 @@ export async function checkWorkerHealth(opts: {
// ---------------------------------------------------------------------------
// Case 3: Active with correct label and alive session — check for staleness
// Skip if sessions lookup unavailable (gateway timeout)
// ---------------------------------------------------------------------------
if (worker.active && worker.startTime && sessionKey && isSessionAlive(sessionKey, sessions)) {
if (worker.active && worker.startTime && sessionKey && sessions && isSessionAlive(sessionKey, sessions)) {
const hours = (Date.now() - new Date(worker.startTime).getTime()) / 3_600_000;
if (hours > 2) {
const fix: HealthFix = {