feat: redesign health check to triangulate projects.json, issue label, and session state (#143) (#145)

## Changes

- Remove `activeSessions` parameter from health check (was never populated)
- Add gateway session lookup via `openclaw gateway call status`
- Add issue label lookup via `provider.getIssue(issueId)`
- Implement detection matrix with 6 issue types:
  - session_dead: active worker but session missing in gateway
  - label_mismatch: active worker but issue not in Doing/Testing
  - stale_worker: active for >2h
  - stuck_label: inactive but issue has Doing/Testing label
  - orphan_issue_id: inactive but issueId set
  - issue_gone: active but issue deleted/closed

## Files

- lib/services/health.ts — complete rewrite with three-source triangulation
- lib/tools/health.ts — remove activeSessions param, fetch sessions from gateway
- lib/services/heartbeat.ts — remove empty activeSessions calls, pass sessions map
This commit is contained in:
Lauren ten Hoor
2026-02-13 16:20:21 +08:00
committed by GitHub
parent 4a029c1b3b
commit 825c5e6f50
3 changed files with 337 additions and 68 deletions

View File

@@ -1,19 +1,42 @@
/**
* Health service — worker health checks and auto-fix.
*
* Detects: active_no_session, zombie_session, stale_worker, inactive_with_issue.
* Used by both `status` (read-only) and `auto_pickup` (auto-fix).
* Triangulates THREE sources of truth:
* 1. projects.json — worker state (active, issueId, level, sessions)
* 2. Issue label — current GitHub/GitLab label (Doing, Testing, To Do, etc.)
* 3. Session state — whether the OpenClaw session exists via gateway status
*
* Detection matrix:
* | projects.json | Issue label | Session | Action |
* |---------------|-------------------|--------------|-------------------------------------------|
* | active | Doing/Testing ✅ | dead/missing | Deactivate worker, revert to To Do/To Test |
* | active | NOT Doing/Testing | any | Deactivate worker (moved externally) |
* | active | Doing/Testing ✅ | alive | Healthy; if stale >2h: flag, and with auto-fix revert label + deactivate |
* | inactive | Doing/Testing | any | Revert issue to To Do/To Test (label stuck)|
* | inactive | issueId set | any | Clear issueId (warning) |
* | active | issue deleted | any | Deactivate worker, clear state |
*/
import type { StateLabel } from "../providers/provider.js";
import type { StateLabel, IssueProvider, Issue } from "../providers/provider.js";
import {
getSessionForLevel,
getWorker,
updateWorker,
type Project,
} from "../projects.js";
import { runCommand } from "../run-command.js";
// ---------------------------------------------------------------------------
// Types
// ---------------------------------------------------------------------------
export type HealthIssue = {
type: "active_no_session" | "zombie_session" | "stale_worker" | "inactive_with_issue";
type:
| "session_dead" // Case 1: active worker but session missing/dead
| "label_mismatch" // Case 2: active worker but issue not in Doing/Testing
| "stale_worker" // Case 3: active for >2h
| "stuck_label" // Case 4: inactive but issue still has Doing/Testing
| "orphan_issue_id" // Case 5: inactive but issueId set
| "issue_gone"; // Case 6: active but issue deleted/closed
severity: "critical" | "warning";
project: string;
groupId: string;
@@ -23,6 +46,8 @@ export type HealthIssue = {
sessionKey?: string | null;
hoursActive?: number;
issueId?: string | null;
expectedLabel?: string;
actualLabel?: string | null;
};
export type HealthFix = {
@@ -32,81 +57,328 @@ export type HealthFix = {
labelRevertFailed?: boolean;
};
/**
 * One session entry as read from `sessions.recent` in the gateway status JSON.
 */
export type GatewaySession = {
// Session key; this is the value sessions are indexed by in a SessionLookup.
key: string;
// NOTE(review): presumably a last-update timestamp — unit/epoch is not shown here, confirm against gateway output.
updatedAt: number;
// NOTE(review): presumably a usage percentage reported by the gateway — confirm what it measures.
// Not used when deciding whether a session is alive.
percentUsed: number;
// Optional flag reported by the gateway; not used when deciding whether a session is alive.
abortedLastRun?: boolean;
};
/** Gateway sessions indexed by their `key`, for O(1) existence checks during a health pass. */
export type SessionLookup = Map<string, GatewaySession>;
// ---------------------------------------------------------------------------
// Gateway session lookup
// ---------------------------------------------------------------------------
/**
 * Query gateway status and build a lookup map of the sessions it reports.
 *
 * Runs `openclaw gateway call status --json` (15 s timeout) and indexes every
 * well-formed entry of `sessions.recent` by its `key`. This function does not
 * cache anything itself: callers should invoke it once per health pass and
 * hand the resulting map to each individual check.
 *
 * @returns A map from session key to session entry. The map is empty when the
 *   command fails or times out, when its output is not valid JSON, or when
 *   `sessions.recent` is missing or not an array.
 *
 * NOTE(review): an empty map is indistinguishable from "the gateway has no
 * sessions", so during a gateway outage every session looks missing to the
 * caller. Confirm this is acceptable wherever auto-fix is enabled.
 */
export async function fetchGatewaySessions(): Promise<SessionLookup> {
  const lookup: SessionLookup = new Map();

  try {
    const result = await runCommand(
      ["openclaw", "gateway", "call", "status", "--json"],
      { timeoutMs: 15_000 },
    );

    // Treat the parsed payload as untrusted until its shape has been checked.
    const data: unknown = JSON.parse(result.stdout);
    const recent = (data as { sessions?: { recent?: unknown } } | null)?.sessions?.recent;
    if (!Array.isArray(recent)) {
      return lookup;
    }

    for (const entry of recent) {
      // Skip malformed entries instead of throwing mid-loop, which would
      // silently drop every valid session that follows them.
      if (typeof entry !== "object" || entry === null) continue;

      const session = entry as GatewaySession;
      if (typeof session.key === "string" && session.key !== "") {
        lookup.set(session.key, session);
      }
    }
  } catch {
    // Gateway unavailable or output unparsable — fall through and return an
    // empty map (all sessions will be treated as missing by callers).
  }

  return lookup;
}
/**
 * Decide whether the gateway still knows about a session.
 *
 * Liveness is purely "the key is present in the lookup". Fields such as
 * `percentUsed` and `abortedLastRun` are deliberately ignored: they are
 * normal states for a reusable session, not signs that it is dead.
 *
 * @param sessionKey Key of the session to look for.
 * @param sessions   Lookup built from the gateway status.
 * @returns `true` when the key is present in `sessions`, otherwise `false`.
 */
function isSessionAlive(sessionKey: string, sessions: SessionLookup): boolean {
  const known = sessions.has(sessionKey);
  return known;
}
// ---------------------------------------------------------------------------
// Issue label lookup
// ---------------------------------------------------------------------------
/**
 * Look up an issue through the provider, mapping any failure to `null`.
 *
 * @param provider Issue provider used for the lookup.
 * @param issueId  Numeric id of the issue to fetch.
 * @returns The issue returned by `provider.getIssue`, or `null` when that
 *   call throws or rejects for any reason.
 *
 * NOTE(review): every error collapses to `null`, so a transient provider
 * failure cannot be told apart from a deleted or inaccessible issue here.
 */
async function fetchIssue(provider: IssueProvider, issueId: number): Promise<Issue | null> {
  let found: Issue | null;
  try {
    found = await provider.getIssue(issueId);
  } catch {
    // Issue deleted, closed, or inaccessible
    found = null;
  }
  return found;
}
// ---------------------------------------------------------------------------
// Health check logic
// ---------------------------------------------------------------------------
/** In-progress state label an issue is expected to carry while the given role is working on it. */
const ACTIVE_LABELS: Record<"dev" | "qa", StateLabel> = { dev: "Doing", qa: "Testing" };
/** Queue state label an issue is moved back to, per role, when stuck in-progress state is cleared. */
const QUEUE_LABELS: Record<"dev" | "qa", StateLabel> = { dev: "To Do", qa: "To Test" };
export async function checkWorkerHealth(opts: {
workspaceDir: string;
groupId: string;
project: Project;
role: "dev" | "qa";
activeSessions: string[];
autoFix: boolean;
provider: {
transitionLabel(id: number, from: StateLabel, to: StateLabel): Promise<void>;
};
provider: IssueProvider;
sessions: SessionLookup;
}): Promise<HealthFix[]> {
const { workspaceDir, groupId, project, role, activeSessions, autoFix, provider } = opts;
const { workspaceDir, groupId, project, role, autoFix, provider, sessions } = opts;
const fixes: HealthFix[] = [];
const worker = getWorker(project, role);
const sessionKey = worker.level ? getSessionForLevel(worker, worker.level) : null;
const revertLabel: StateLabel = role === "dev" ? "To Do" : "To Test";
const currentLabel: StateLabel = role === "dev" ? "Doing" : "Testing";
const expectedLabel = ACTIVE_LABELS[role];
const queueLabel = QUEUE_LABELS[role];
async function revertIssueLabel(fix: HealthFix) {
if (!worker.issueId) return;
// Parse issueId (may be comma-separated for batch, take first)
const issueIdNum = worker.issueId ? Number(worker.issueId.split(",")[0]) : null;
// Fetch issue state if we have an issueId
let issue: Issue | null = null;
let currentLabel: StateLabel | null = null;
if (issueIdNum) {
issue = await fetchIssue(provider, issueIdNum);
currentLabel = issue ? provider.getCurrentStateLabel(issue) : null;
}
// Helper to revert label
async function revertLabel(fix: HealthFix, from: StateLabel, to: StateLabel) {
if (!issueIdNum) return;
try {
const id = Number(worker.issueId.split(",")[0]);
await provider.transitionLabel(id, currentLabel, revertLabel);
fix.labelReverted = `${currentLabel}${revertLabel}`;
await provider.transitionLabel(issueIdNum, from, to);
fix.labelReverted = `${from}${to}`;
} catch {
fix.labelRevertFailed = true;
}
}
// Check 1: Active but no session key for current level
// Helper to deactivate worker
async function deactivate(clearSessions = false) {
const updates: Record<string, unknown> = {
active: false,
issueId: null,
startTime: null,
};
if (clearSessions && worker.level) {
updates.sessions = { ...worker.sessions, [worker.level]: null };
}
await updateWorker(workspaceDir, groupId, role, updates);
}
// ---------------------------------------------------------------------------
// Case 6: Active but issue doesn't exist (deleted/closed externally)
// ---------------------------------------------------------------------------
if (worker.active && issueIdNum && !issue) {
const fix: HealthFix = {
issue: {
type: "issue_gone",
severity: "critical",
project: project.name,
groupId,
role,
level: worker.level,
sessionKey,
issueId: worker.issueId,
message: `${role.toUpperCase()} active but issue #${issueIdNum} no longer exists or is closed`,
},
fixed: false,
};
if (autoFix) {
await deactivate(true);
fix.fixed = true;
}
fixes.push(fix);
return fixes; // No point checking further
}
// ---------------------------------------------------------------------------
// Case 2: Active but issue label is NOT the expected in-progress label
// ---------------------------------------------------------------------------
if (worker.active && issue && currentLabel !== expectedLabel) {
const fix: HealthFix = {
issue: {
type: "label_mismatch",
severity: "critical",
project: project.name,
groupId,
role,
level: worker.level,
sessionKey,
issueId: worker.issueId,
expectedLabel,
actualLabel: currentLabel,
message: `${role.toUpperCase()} active but issue #${issueIdNum} has label "${currentLabel}" (expected "${expectedLabel}")`,
},
fixed: false,
};
if (autoFix) {
await deactivate(true);
fix.fixed = true;
}
fixes.push(fix);
return fixes; // State is invalid, don't check session
}
// ---------------------------------------------------------------------------
// Case 1: Active with correct label but session is dead/missing
// ---------------------------------------------------------------------------
if (worker.active && sessionKey && !isSessionAlive(sessionKey, sessions)) {
const fix: HealthFix = {
issue: {
type: "session_dead",
severity: "critical",
project: project.name,
groupId,
role,
sessionKey,
level: worker.level,
issueId: worker.issueId,
message: `${role.toUpperCase()} active but session "${sessionKey}" not found in gateway`,
},
fixed: false,
};
if (autoFix) {
await revertLabel(fix, expectedLabel, queueLabel);
await deactivate(true);
fix.fixed = true;
}
fixes.push(fix);
return fixes;
}
// ---------------------------------------------------------------------------
// Case 1b: Active but no session key at all (shouldn't happen normally)
// ---------------------------------------------------------------------------
if (worker.active && !sessionKey) {
const fix: HealthFix = {
issue: {
type: "active_no_session", severity: "critical",
project: project.name, groupId, role,
type: "session_dead",
severity: "critical",
project: project.name,
groupId,
role,
level: worker.level,
message: `${role.toUpperCase()} active but no session for level "${worker.level}"`,
issueId: worker.issueId,
message: `${role.toUpperCase()} active but no session key for level "${worker.level}"`,
},
fixed: false,
};
if (autoFix) {
await updateWorker(workspaceDir, groupId, role, { active: false, issueId: null, startTime: null });
if (issue && currentLabel === expectedLabel) {
await revertLabel(fix, expectedLabel, queueLabel);
}
await deactivate();
fix.fixed = true;
}
fixes.push(fix);
return fixes;
}
// Check 2: Active with session but session is dead (zombie)
if (worker.active && sessionKey && activeSessions.length > 0 && !activeSessions.includes(sessionKey)) {
// ---------------------------------------------------------------------------
// Case 3: Active with correct label and alive session — check for staleness
// ---------------------------------------------------------------------------
if (worker.active && worker.startTime && sessionKey && isSessionAlive(sessionKey, sessions)) {
const hours = (Date.now() - new Date(worker.startTime).getTime()) / 3_600_000;
if (hours > 2) {
const fix: HealthFix = {
issue: {
type: "zombie_session", severity: "critical",
project: project.name, groupId, role,
sessionKey, level: worker.level,
message: `${role.toUpperCase()} session not in active sessions list`,
type: "stale_worker",
severity: "warning",
project: project.name,
groupId,
role,
hoursActive: Math.round(hours * 10) / 10,
sessionKey,
issueId: worker.issueId,
message: `${role.toUpperCase()} active for ${Math.round(hours * 10) / 10}h — may need attention`,
},
fixed: false,
};
// Stale workers get auto-fixed: revert label and deactivate
if (autoFix) {
await revertIssueLabel(fix);
const sessions = { ...worker.sessions };
if (worker.level) sessions[worker.level] = null;
await updateWorker(workspaceDir, groupId, role, { active: false, issueId: null, startTime: null, sessions });
await revertLabel(fix, expectedLabel, queueLabel);
await deactivate();
fix.fixed = true;
}
fixes.push(fix);
}
// Otherwise: healthy, no issues to report
}
// Check 3: Inactive but still has issueId
// ---------------------------------------------------------------------------
// Case 4: Inactive but issue has stuck Doing/Testing label
// ---------------------------------------------------------------------------
if (!worker.active && issue && currentLabel === expectedLabel) {
const fix: HealthFix = {
issue: {
type: "stuck_label",
severity: "critical",
project: project.name,
groupId,
role,
issueId: worker.issueId,
expectedLabel: queueLabel,
actualLabel: currentLabel,
message: `${role.toUpperCase()} inactive but issue #${issueIdNum} still has "${currentLabel}" label`,
},
fixed: false,
};
if (autoFix) {
await revertLabel(fix, expectedLabel, queueLabel);
// Also clear the issueId if present
if (worker.issueId) {
await updateWorker(workspaceDir, groupId, role, { issueId: null });
}
fix.fixed = true;
}
fixes.push(fix);
return fixes;
}
// ---------------------------------------------------------------------------
// Case 5: Inactive but still has issueId set (orphan reference)
// ---------------------------------------------------------------------------
if (!worker.active && worker.issueId) {
const fix: HealthFix = {
issue: {
type: "inactive_with_issue", severity: "warning",
project: project.name, groupId, role,
type: "orphan_issue_id",
severity: "warning",
project: project.name,
groupId,
role,
issueId: worker.issueId,
message: `${role.toUpperCase()} inactive but still has issueId "${worker.issueId}"`,
},
@@ -119,28 +391,5 @@ export async function checkWorkerHealth(opts: {
fixes.push(fix);
}
// Check 4: Active for >2 hours (stale)
if (worker.active && worker.startTime && sessionKey) {
const hours = (Date.now() - new Date(worker.startTime).getTime()) / 3_600_000;
if (hours > 2) {
const fix: HealthFix = {
issue: {
type: "stale_worker", severity: "warning",
project: project.name, groupId, role,
hoursActive: Math.round(hours * 10) / 10,
sessionKey, issueId: worker.issueId,
message: `${role.toUpperCase()} active for ${Math.round(hours * 10) / 10}h — may need attention`,
},
fixed: false,
};
if (autoFix) {
await revertIssueLabel(fix);
await updateWorker(workspaceDir, groupId, role, { active: false, issueId: null, startTime: null });
fix.fixed = true;
}
fixes.push(fix);
}
}
return fixes;
}

View File

@@ -15,7 +15,7 @@ import fs from "node:fs";
import path from "node:path";
import { readProjects } from "../projects.js";
import { log as auditLog } from "../audit.js";
import { checkWorkerHealth } from "./health.js";
import { checkWorkerHealth, fetchGatewaySessions, type SessionLookup } from "./health.js";
import { projectTick } from "./tick.js";
import { createProvider } from "../providers/index.js";
import { notifyTickPickups, getNotificationConfig } from "../notify.js";
@@ -184,12 +184,16 @@ async function processAllAgents(
totalSkipped: 0,
};
// Fetch gateway sessions once for all agents/projects
const sessions = await fetchGatewaySessions();
for (const { agentId, workspace } of agents) {
const agentResult = await tick({
workspaceDir: workspace,
agentId,
config,
pluginConfig,
sessions,
logger,
});
@@ -221,9 +225,10 @@ export async function tick(opts: {
agentId?: string;
config: HeartbeatConfig;
pluginConfig?: Record<string, unknown>;
sessions: SessionLookup;
logger: { info(msg: string): void; warn(msg: string): void };
}): Promise<TickResult> {
const { workspaceDir, agentId, config, pluginConfig } = opts;
const { workspaceDir, agentId, config, pluginConfig, sessions } = opts;
const data = await readProjects(workspaceDir);
const projectIds = Object.keys(data.projects);
@@ -250,6 +255,7 @@ export async function tick(opts: {
workspaceDir,
groupId,
project,
sessions,
);
// Budget check: stop if we've hit the limit
@@ -304,6 +310,7 @@ async function performHealthPass(
workspaceDir: string,
groupId: string,
project: any,
sessions: SessionLookup,
): Promise<number> {
const { provider } = await createProvider({ repo: project.repo });
let fixedCount = 0;
@@ -314,7 +321,7 @@ async function performHealthPass(
groupId,
project,
role,
activeSessions: [],
sessions,
autoFix: true,
provider,
});
@@ -332,5 +339,3 @@ async function checkProjectActive(workspaceDir: string, groupId: string): Promis
if (!fresh) return false;
return fresh.dev.active || fresh.qa.active;
}

View File

@@ -1,13 +1,21 @@
/**
* health — Worker health scan with optional auto-fix.
*
* Triangulates projects.json, issue labels, and session state to detect:
* - session_dead: active worker but session missing in gateway
* - label_mismatch: active worker but issue not in expected label
* - stale_worker: active for >2h
* - stuck_label: inactive but issue has Doing/Testing label
* - orphan_issue_id: inactive but issueId set
* - issue_gone: active but issue deleted/closed
*
* Read-only by default (surfaces issues). Pass fix=true to apply fixes.
*/
import { jsonResult } from "openclaw/plugin-sdk";
import type { ToolContext } from "../types.js";
import { readProjects, getProject } from "../projects.js";
import { log as auditLog } from "../audit.js";
import { checkWorkerHealth, type HealthFix } from "../services/health.js";
import { checkWorkerHealth, fetchGatewaySessions, type HealthFix } from "../services/health.js";
import { requireWorkspaceDir, resolveProvider } from "../tool-helpers.js";
export function createHealthTool() {
@@ -20,20 +28,21 @@ export function createHealthTool() {
properties: {
projectGroupId: { type: "string", description: "Filter to specific project. Omit for all." },
fix: { type: "boolean", description: "Apply fixes for detected issues. Default: false (read-only)." },
activeSessions: { type: "array", items: { type: "string" }, description: "Active session IDs for zombie detection." },
},
},
async execute(_id: string, params: Record<string, unknown>) {
const workspaceDir = requireWorkspaceDir(ctx);
const fix = (params.fix as boolean) ?? false;
const activeSessions = (params.activeSessions as string[]) ?? [];
const groupId = params.projectGroupId as string | undefined;
const data = await readProjects(workspaceDir);
const projectIds = groupId ? [groupId] : Object.keys(data.projects);
// Fetch gateway sessions once for all projects
const sessions = await fetchGatewaySessions();
const issues: Array<HealthFix & { project: string; role: string }> = [];
for (const pid of projectIds) {
@@ -43,8 +52,13 @@ export function createHealthTool() {
for (const role of ["dev", "qa"] as const) {
const fixes = await checkWorkerHealth({
workspaceDir, groupId: pid, project, role, activeSessions,
autoFix: fix, provider,
workspaceDir,
groupId: pid,
project,
role,
sessions,
autoFix: fix,
provider,
});
issues.push(...fixes.map((f) => ({ ...f, project: project.name, role })));
}
@@ -55,14 +69,15 @@ export function createHealthTool() {
fix,
issuesFound: issues.length,
issuesFixed: issues.filter((i) => i.fixed).length,
sessionsCached: sessions.size,
});
return jsonResult({
success: true,
fix,
projectsScanned: projectIds.length,
sessionsQueried: sessions.size,
issues,
note: activeSessions.length === 0 ? "No activeSessions provided — zombie detection skipped." : undefined,
});
},
});