feat: redesign health check to triangulate projects.json, issue label, and session state (#143) (#145)

## Changes

- Remove `activeSessions` parameter from health check (was never populated)
- Add gateway session lookup via `openclaw gateway call status`
- Add issue label lookup via `provider.getIssue(issueId)`
- Implement detection matrix with 6 issue types:
  - session_dead: active worker but session missing in gateway
  - label_mismatch: active worker but issue not in Doing/Testing
  - stale_worker: active for >2h
  - stuck_label: inactive but issue has Doing/Testing label
  - orphan_issue_id: inactive but issueId set
  - issue_gone: active but issue deleted/closed

## Files

- lib/services/health.ts — complete rewrite with three-source triangulation
- lib/tools/health.ts — remove activeSessions param, fetch sessions from gateway
- lib/services/heartbeat.ts — remove empty activeSessions calls, pass sessions map
This commit is contained in:
Lauren ten Hoor
2026-02-13 16:20:21 +08:00
committed by GitHub
parent 4a029c1b3b
commit 825c5e6f50
3 changed files with 337 additions and 68 deletions

View File

@@ -1,19 +1,42 @@
/**
* Health service — worker health checks and auto-fix.
*
* Detects: active_no_session, zombie_session, stale_worker, inactive_with_issue.
* Used by both `status` (read-only) and `auto_pickup` (auto-fix).
* Triangulates THREE sources of truth:
* 1. projects.json — worker state (active, issueId, level, sessions)
* 2. Issue label — current GitHub/GitLab label (Doing, Testing, To Do, etc.)
* 3. Session state — whether the OpenClaw session exists via gateway status
*
* Detection matrix:
* | projects.json | Issue label | Session | Action |
* |---------------|-------------------|--------------|-------------------------------------------|
* | active | Doing/Testing ✅ | dead/missing | Deactivate worker, revert to To Do/To Test |
* | active | NOT Doing/Testing | any | Deactivate worker (moved externally) |
* | active | Doing/Testing ✅ | alive | Healthy (flag if stale >2h) |
* | inactive | Doing/Testing | any | Revert issue to To Do/To Test (label stuck)|
* | inactive | issueId set | any | Clear issueId (warning) |
* | active | issue deleted | any | Deactivate worker, clear state |
*/
import type { StateLabel } from "../providers/provider.js";
import type { StateLabel, IssueProvider, Issue } from "../providers/provider.js";
import {
getSessionForLevel,
getWorker,
updateWorker,
type Project,
} from "../projects.js";
import { runCommand } from "../run-command.js";
// ---------------------------------------------------------------------------
// Types
// ---------------------------------------------------------------------------
export type HealthIssue = {
type: "active_no_session" | "zombie_session" | "stale_worker" | "inactive_with_issue";
type:
| "session_dead" // Case 1: active worker but session missing/dead
| "label_mismatch" // Case 2: active worker but issue not in Doing/Testing
| "stale_worker" // Case 3: active for >2h
| "stuck_label" // Case 4: inactive but issue still has Doing/Testing
| "orphan_issue_id" // Case 5: inactive but issueId set
| "issue_gone"; // Case 6: active but issue deleted/closed
severity: "critical" | "warning";
project: string;
groupId: string;
@@ -23,6 +46,8 @@ export type HealthIssue = {
sessionKey?: string | null;
hoursActive?: number;
issueId?: string | null;
expectedLabel?: string;
actualLabel?: string | null;
};
export type HealthFix = {
@@ -32,81 +57,328 @@ export type HealthFix = {
labelRevertFailed?: boolean;
};
export type GatewaySession = {
key: string;
updatedAt: number;
percentUsed: number;
abortedLastRun?: boolean;
};
export type SessionLookup = Map<string, GatewaySession>;
// ---------------------------------------------------------------------------
// Gateway session lookup
// ---------------------------------------------------------------------------
/**
* Query gateway status and build a lookup map of active sessions.
* Caches result for the duration of a health check pass.
*/
export async function fetchGatewaySessions(): Promise<SessionLookup> {
const lookup: SessionLookup = new Map();
try {
const result = await runCommand(
["openclaw", "gateway", "call", "status", "--json"],
{ timeoutMs: 15_000 },
);
const data = JSON.parse(result.stdout);
const sessions: GatewaySession[] = data?.sessions?.recent ?? [];
for (const session of sessions) {
if (session.key) {
lookup.set(session.key, session);
}
}
} catch {
// Gateway unavailable — return empty map (all sessions will be treated as missing)
}
return lookup;
}
/**
* Check if a session key exists in the gateway and is considered "alive".
* A session is alive if it exists. We don't consider percentUsed or abortedLastRun
* as dead indicators — those are normal states for reusable sessions.
*/
function isSessionAlive(sessionKey: string, sessions: SessionLookup): boolean {
return sessions.has(sessionKey);
}
// ---------------------------------------------------------------------------
// Issue label lookup
// ---------------------------------------------------------------------------
/**
* Fetch current issue state from the provider.
* Returns null if issue doesn't exist or is inaccessible.
*/
async function fetchIssue(
provider: IssueProvider,
issueId: number,
): Promise<Issue | null> {
try {
return await provider.getIssue(issueId);
} catch {
return null; // Issue deleted, closed, or inaccessible
}
}
// ---------------------------------------------------------------------------
// Health check logic
// ---------------------------------------------------------------------------
/**
* Expected in-progress labels for each role.
*/
const ACTIVE_LABELS: Record<"dev" | "qa", StateLabel> = {
dev: "Doing",
qa: "Testing",
};
/**
* Queue labels to revert to when clearing stuck state.
*/
const QUEUE_LABELS: Record<"dev" | "qa", StateLabel> = {
dev: "To Do",
qa: "To Test",
};
export async function checkWorkerHealth(opts: {
workspaceDir: string;
groupId: string;
project: Project;
role: "dev" | "qa";
activeSessions: string[];
autoFix: boolean;
provider: {
transitionLabel(id: number, from: StateLabel, to: StateLabel): Promise<void>;
};
provider: IssueProvider;
sessions: SessionLookup;
}): Promise<HealthFix[]> {
const { workspaceDir, groupId, project, role, activeSessions, autoFix, provider } = opts;
const { workspaceDir, groupId, project, role, autoFix, provider, sessions } = opts;
const fixes: HealthFix[] = [];
const worker = getWorker(project, role);
const sessionKey = worker.level ? getSessionForLevel(worker, worker.level) : null;
const revertLabel: StateLabel = role === "dev" ? "To Do" : "To Test";
const currentLabel: StateLabel = role === "dev" ? "Doing" : "Testing";
const expectedLabel = ACTIVE_LABELS[role];
const queueLabel = QUEUE_LABELS[role];
async function revertIssueLabel(fix: HealthFix) {
if (!worker.issueId) return;
// Parse issueId (may be comma-separated for batch, take first)
const issueIdNum = worker.issueId ? Number(worker.issueId.split(",")[0]) : null;
// Fetch issue state if we have an issueId
let issue: Issue | null = null;
let currentLabel: StateLabel | null = null;
if (issueIdNum) {
issue = await fetchIssue(provider, issueIdNum);
currentLabel = issue ? provider.getCurrentStateLabel(issue) : null;
}
// Helper to revert label
async function revertLabel(fix: HealthFix, from: StateLabel, to: StateLabel) {
if (!issueIdNum) return;
try {
const id = Number(worker.issueId.split(",")[0]);
await provider.transitionLabel(id, currentLabel, revertLabel);
fix.labelReverted = `${currentLabel}${revertLabel}`;
await provider.transitionLabel(issueIdNum, from, to);
fix.labelReverted = `${from}${to}`;
} catch {
fix.labelRevertFailed = true;
}
}
// Check 1: Active but no session key for current level
// Helper to deactivate worker
async function deactivate(clearSessions = false) {
const updates: Record<string, unknown> = {
active: false,
issueId: null,
startTime: null,
};
if (clearSessions && worker.level) {
updates.sessions = { ...worker.sessions, [worker.level]: null };
}
await updateWorker(workspaceDir, groupId, role, updates);
}
// ---------------------------------------------------------------------------
// Case 6: Active but issue doesn't exist (deleted/closed externally)
// ---------------------------------------------------------------------------
if (worker.active && issueIdNum && !issue) {
const fix: HealthFix = {
issue: {
type: "issue_gone",
severity: "critical",
project: project.name,
groupId,
role,
level: worker.level,
sessionKey,
issueId: worker.issueId,
message: `${role.toUpperCase()} active but issue #${issueIdNum} no longer exists or is closed`,
},
fixed: false,
};
if (autoFix) {
await deactivate(true);
fix.fixed = true;
}
fixes.push(fix);
return fixes; // No point checking further
}
// ---------------------------------------------------------------------------
// Case 2: Active but issue label is NOT the expected in-progress label
// ---------------------------------------------------------------------------
if (worker.active && issue && currentLabel !== expectedLabel) {
const fix: HealthFix = {
issue: {
type: "label_mismatch",
severity: "critical",
project: project.name,
groupId,
role,
level: worker.level,
sessionKey,
issueId: worker.issueId,
expectedLabel,
actualLabel: currentLabel,
message: `${role.toUpperCase()} active but issue #${issueIdNum} has label "${currentLabel}" (expected "${expectedLabel}")`,
},
fixed: false,
};
if (autoFix) {
await deactivate(true);
fix.fixed = true;
}
fixes.push(fix);
return fixes; // State is invalid, don't check session
}
// ---------------------------------------------------------------------------
// Case 1: Active with correct label but session is dead/missing
// ---------------------------------------------------------------------------
if (worker.active && sessionKey && !isSessionAlive(sessionKey, sessions)) {
const fix: HealthFix = {
issue: {
type: "session_dead",
severity: "critical",
project: project.name,
groupId,
role,
sessionKey,
level: worker.level,
issueId: worker.issueId,
message: `${role.toUpperCase()} active but session "${sessionKey}" not found in gateway`,
},
fixed: false,
};
if (autoFix) {
await revertLabel(fix, expectedLabel, queueLabel);
await deactivate(true);
fix.fixed = true;
}
fixes.push(fix);
return fixes;
}
// ---------------------------------------------------------------------------
// Case 1b: Active but no session key at all (shouldn't happen normally)
// ---------------------------------------------------------------------------
if (worker.active && !sessionKey) {
const fix: HealthFix = {
issue: {
type: "active_no_session", severity: "critical",
project: project.name, groupId, role,
type: "session_dead",
severity: "critical",
project: project.name,
groupId,
role,
level: worker.level,
message: `${role.toUpperCase()} active but no session for level "${worker.level}"`,
issueId: worker.issueId,
message: `${role.toUpperCase()} active but no session key for level "${worker.level}"`,
},
fixed: false,
};
if (autoFix) {
await updateWorker(workspaceDir, groupId, role, { active: false, issueId: null, startTime: null });
if (issue && currentLabel === expectedLabel) {
await revertLabel(fix, expectedLabel, queueLabel);
}
await deactivate();
fix.fixed = true;
}
fixes.push(fix);
return fixes;
}
// Check 2: Active with session but session is dead (zombie)
if (worker.active && sessionKey && activeSessions.length > 0 && !activeSessions.includes(sessionKey)) {
// ---------------------------------------------------------------------------
// Case 3: Active with correct label and alive session — check for staleness
// ---------------------------------------------------------------------------
if (worker.active && worker.startTime && sessionKey && isSessionAlive(sessionKey, sessions)) {
const hours = (Date.now() - new Date(worker.startTime).getTime()) / 3_600_000;
if (hours > 2) {
const fix: HealthFix = {
issue: {
type: "stale_worker",
severity: "warning",
project: project.name,
groupId,
role,
hoursActive: Math.round(hours * 10) / 10,
sessionKey,
issueId: worker.issueId,
message: `${role.toUpperCase()} active for ${Math.round(hours * 10) / 10}h — may need attention`,
},
fixed: false,
};
// Stale workers get auto-fixed: revert label and deactivate
if (autoFix) {
await revertLabel(fix, expectedLabel, queueLabel);
await deactivate();
fix.fixed = true;
}
fixes.push(fix);
}
// Otherwise: healthy, no issues to report
}
// ---------------------------------------------------------------------------
// Case 4: Inactive but issue has stuck Doing/Testing label
// ---------------------------------------------------------------------------
if (!worker.active && issue && currentLabel === expectedLabel) {
const fix: HealthFix = {
issue: {
type: "zombie_session", severity: "critical",
project: project.name, groupId, role,
sessionKey, level: worker.level,
message: `${role.toUpperCase()} session not in active sessions list`,
type: "stuck_label",
severity: "critical",
project: project.name,
groupId,
role,
issueId: worker.issueId,
expectedLabel: queueLabel,
actualLabel: currentLabel,
message: `${role.toUpperCase()} inactive but issue #${issueIdNum} still has "${currentLabel}" label`,
},
fixed: false,
};
if (autoFix) {
await revertIssueLabel(fix);
const sessions = { ...worker.sessions };
if (worker.level) sessions[worker.level] = null;
await updateWorker(workspaceDir, groupId, role, { active: false, issueId: null, startTime: null, sessions });
await revertLabel(fix, expectedLabel, queueLabel);
// Also clear the issueId if present
if (worker.issueId) {
await updateWorker(workspaceDir, groupId, role, { issueId: null });
}
fix.fixed = true;
}
fixes.push(fix);
return fixes;
}
// Check 3: Inactive but still has issueId
// ---------------------------------------------------------------------------
// Case 5: Inactive but still has issueId set (orphan reference)
// ---------------------------------------------------------------------------
if (!worker.active && worker.issueId) {
const fix: HealthFix = {
issue: {
type: "inactive_with_issue", severity: "warning",
project: project.name, groupId, role,
type: "orphan_issue_id",
severity: "warning",
project: project.name,
groupId,
role,
issueId: worker.issueId,
message: `${role.toUpperCase()} inactive but still has issueId "${worker.issueId}"`,
},
@@ -119,28 +391,5 @@ export async function checkWorkerHealth(opts: {
fixes.push(fix);
}
// Check 4: Active for >2 hours (stale)
if (worker.active && worker.startTime && sessionKey) {
const hours = (Date.now() - new Date(worker.startTime).getTime()) / 3_600_000;
if (hours > 2) {
const fix: HealthFix = {
issue: {
type: "stale_worker", severity: "warning",
project: project.name, groupId, role,
hoursActive: Math.round(hours * 10) / 10,
sessionKey, issueId: worker.issueId,
message: `${role.toUpperCase()} active for ${Math.round(hours * 10) / 10}h — may need attention`,
},
fixed: false,
};
if (autoFix) {
await revertIssueLabel(fix);
await updateWorker(workspaceDir, groupId, role, { active: false, issueId: null, startTime: null });
fix.fixed = true;
}
fixes.push(fix);
}
}
return fixes;
}