feat: redesign health check to triangulate projects.json, issue label, and session state (#143) (#145)
## Changes

- Remove `activeSessions` parameter from health check (was never populated)
- Add gateway session lookup via `openclaw gateway call status`
- Add issue label lookup via `provider.getIssue(issueId)`
- Implement detection matrix with 6 issue types:
  - session_dead: active worker but session missing in gateway
  - label_mismatch: active worker but issue not in Doing/Testing
  - stale_worker: active for >2h
  - stuck_label: inactive but issue has Doing/Testing label
  - orphan_issue_id: inactive but issueId set
  - issue_gone: active but issue deleted/closed

## Files

- lib/services/health.ts — complete rewrite with three-source triangulation
- lib/tools/health.ts — remove activeSessions param, fetch sessions from gateway
- lib/services/heartbeat.ts — remove empty activeSessions calls, pass sessions map
This commit is contained in:
@@ -1,19 +1,42 @@
|
|||||||
/**
|
/**
|
||||||
* Health service — worker health checks and auto-fix.
|
* Health service — worker health checks and auto-fix.
|
||||||
*
|
*
|
||||||
* Detects: active_no_session, zombie_session, stale_worker, inactive_with_issue.
|
* Triangulates THREE sources of truth:
|
||||||
* Used by both `status` (read-only) and `auto_pickup` (auto-fix).
|
* 1. projects.json — worker state (active, issueId, level, sessions)
|
||||||
|
* 2. Issue label — current GitHub/GitLab label (Doing, Testing, To Do, etc.)
|
||||||
|
* 3. Session state — whether the OpenClaw session exists via gateway status
|
||||||
|
*
|
||||||
|
* Detection matrix:
|
||||||
|
* | projects.json | Issue label | Session | Action |
|
||||||
|
* |---------------|-------------------|--------------|-------------------------------------------|
|
||||||
|
* | active | Doing/Testing ✅ | dead/missing | Deactivate worker, revert to To Do/To Test |
|
||||||
|
* | active | NOT Doing/Testing | any | Deactivate worker (moved externally) |
|
||||||
|
* | active | Doing/Testing ✅ | alive | Healthy (flag if stale >2h) |
|
||||||
|
* | inactive | Doing/Testing | any | Revert issue to To Do/To Test (label stuck)|
|
||||||
|
* | inactive | issueId set | any | Clear issueId (warning) |
|
||||||
|
* | active | issue deleted | any | Deactivate worker, clear state |
|
||||||
*/
|
*/
|
||||||
import type { StateLabel } from "../providers/provider.js";
|
import type { StateLabel, IssueProvider, Issue } from "../providers/provider.js";
|
||||||
import {
|
import {
|
||||||
getSessionForLevel,
|
getSessionForLevel,
|
||||||
getWorker,
|
getWorker,
|
||||||
updateWorker,
|
updateWorker,
|
||||||
type Project,
|
type Project,
|
||||||
} from "../projects.js";
|
} from "../projects.js";
|
||||||
|
import { runCommand } from "../run-command.js";
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Types
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
export type HealthIssue = {
|
export type HealthIssue = {
|
||||||
type: "active_no_session" | "zombie_session" | "stale_worker" | "inactive_with_issue";
|
type:
|
||||||
|
| "session_dead" // Case 1: active worker but session missing/dead
|
||||||
|
| "label_mismatch" // Case 2: active worker but issue not in Doing/Testing
|
||||||
|
| "stale_worker" // Case 3: active for >2h
|
||||||
|
| "stuck_label" // Case 4: inactive but issue still has Doing/Testing
|
||||||
|
| "orphan_issue_id" // Case 5: inactive but issueId set
|
||||||
|
| "issue_gone"; // Case 6: active but issue deleted/closed
|
||||||
severity: "critical" | "warning";
|
severity: "critical" | "warning";
|
||||||
project: string;
|
project: string;
|
||||||
groupId: string;
|
groupId: string;
|
||||||
@@ -23,6 +46,8 @@ export type HealthIssue = {
|
|||||||
sessionKey?: string | null;
|
sessionKey?: string | null;
|
||||||
hoursActive?: number;
|
hoursActive?: number;
|
||||||
issueId?: string | null;
|
issueId?: string | null;
|
||||||
|
expectedLabel?: string;
|
||||||
|
actualLabel?: string | null;
|
||||||
};
|
};
|
||||||
|
|
||||||
export type HealthFix = {
|
export type HealthFix = {
|
||||||
@@ -32,81 +57,328 @@ export type HealthFix = {
|
|||||||
labelRevertFailed?: boolean;
|
labelRevertFailed?: boolean;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
export type GatewaySession = {
|
||||||
|
key: string;
|
||||||
|
updatedAt: number;
|
||||||
|
percentUsed: number;
|
||||||
|
abortedLastRun?: boolean;
|
||||||
|
};
|
||||||
|
|
||||||
|
export type SessionLookup = Map<string, GatewaySession>;
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Gateway session lookup
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/**
 * Query the gateway (`openclaw gateway call status --json`) and build a lookup
 * of the sessions it reports under `sessions.recent`, keyed by session key.
 *
 * Nothing is cached here: every call runs the command again. Callers that
 * check several workers should call this once and pass the same map around.
 *
 * @returns A map from session key to the gateway's record for that session.
 *   Never rejects. If the command fails, times out, or prints something that
 *   is not the expected JSON shape, the map is empty — which callers cannot
 *   distinguish from "the gateway has no sessions".
 */
export async function fetchGatewaySessions(): Promise<SessionLookup> {
  const lookup: SessionLookup = new Map();

  try {
    const result = await runCommand(
      ["openclaw", "gateway", "call", "status", "--json"],
      { timeoutMs: 15_000 },
    );

    // Treat the payload as untrusted: JSON.parse can yield any JSON value.
    const data: unknown = JSON.parse(result.stdout);
    const recent = (data as { sessions?: { recent?: unknown } } | null)?.sessions?.recent;
    if (!Array.isArray(recent)) {
      return lookup;
    }

    for (const entry of recent as unknown[]) {
      // Skip malformed entries individually. Previously a single null entry
      // threw inside the loop, was swallowed by the catch below, and silently
      // dropped every session listed after it.
      if (typeof entry !== "object" || entry === null) continue;

      const key = (entry as { key?: unknown }).key;
      // Only non-empty string keys can ever match a worker's session key.
      if (typeof key === "string" && key !== "") {
        lookup.set(key, entry as GatewaySession);
      }
    }
  } catch {
    // Gateway unavailable or output unparseable — return what we have
    // (all sessions not in the map will be treated as missing).
  }

  return lookup;
}
|
||||||
|
|
||||||
|
/**
 * Decide whether the gateway still knows about a session.
 *
 * Presence in the lookup is the only criterion. `percentUsed` and
 * `abortedLastRun` are deliberately ignored: they are normal states for
 * reusable sessions, not signs that the session is dead.
 *
 * @param sessionKey Key of the session to look for.
 * @param sessions Lookup built from the gateway status.
 * @returns `true` when the key is present in `sessions`, otherwise `false`.
 */
function isSessionAlive(sessionKey: string, sessions: SessionLookup): boolean {
  const knownToGateway = sessions.has(sessionKey);
  return knownToGateway;
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Issue label lookup
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/**
 * Look up an issue through the provider, turning any failure into `null`.
 *
 * @param provider Issue provider used for the lookup.
 * @param issueId Numeric id of the issue to fetch.
 * @returns Whatever `provider.getIssue` resolves to, or `null` if the call
 *   throws or rejects for any reason.
 *
 * NOTE(review): every error is mapped to `null`, so a deleted issue and a
 * failed provider call look the same to the caller — confirm whether
 * `getIssue` can fail transiently before relying on `null` meaning "gone".
 */
async function fetchIssue(
  provider: IssueProvider,
  issueId: number,
): Promise<Issue | null> {
  let found: Issue | null;
  try {
    found = await provider.getIssue(issueId);
  } catch {
    // Issue deleted, closed, or inaccessible
    found = null;
  }
  return found;
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Health check logic
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Expected in-progress labels for each role.
|
||||||
|
*/
|
||||||
|
const ACTIVE_LABELS: Record<"dev" | "qa", StateLabel> = {
|
||||||
|
dev: "Doing",
|
||||||
|
qa: "Testing",
|
||||||
|
};
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Queue labels to revert to when clearing stuck state.
|
||||||
|
*/
|
||||||
|
const QUEUE_LABELS: Record<"dev" | "qa", StateLabel> = {
|
||||||
|
dev: "To Do",
|
||||||
|
qa: "To Test",
|
||||||
|
};
|
||||||
|
|
||||||
export async function checkWorkerHealth(opts: {
|
export async function checkWorkerHealth(opts: {
|
||||||
workspaceDir: string;
|
workspaceDir: string;
|
||||||
groupId: string;
|
groupId: string;
|
||||||
project: Project;
|
project: Project;
|
||||||
role: "dev" | "qa";
|
role: "dev" | "qa";
|
||||||
activeSessions: string[];
|
|
||||||
autoFix: boolean;
|
autoFix: boolean;
|
||||||
provider: {
|
provider: IssueProvider;
|
||||||
transitionLabel(id: number, from: StateLabel, to: StateLabel): Promise<void>;
|
sessions: SessionLookup;
|
||||||
};
|
|
||||||
}): Promise<HealthFix[]> {
|
}): Promise<HealthFix[]> {
|
||||||
const { workspaceDir, groupId, project, role, activeSessions, autoFix, provider } = opts;
|
const { workspaceDir, groupId, project, role, autoFix, provider, sessions } = opts;
|
||||||
const fixes: HealthFix[] = [];
|
const fixes: HealthFix[] = [];
|
||||||
const worker = getWorker(project, role);
|
const worker = getWorker(project, role);
|
||||||
const sessionKey = worker.level ? getSessionForLevel(worker, worker.level) : null;
|
const sessionKey = worker.level ? getSessionForLevel(worker, worker.level) : null;
|
||||||
|
|
||||||
const revertLabel: StateLabel = role === "dev" ? "To Do" : "To Test";
|
const expectedLabel = ACTIVE_LABELS[role];
|
||||||
const currentLabel: StateLabel = role === "dev" ? "Doing" : "Testing";
|
const queueLabel = QUEUE_LABELS[role];
|
||||||
|
|
||||||
async function revertIssueLabel(fix: HealthFix) {
|
// Parse issueId (may be comma-separated for batch, take first)
|
||||||
if (!worker.issueId) return;
|
const issueIdNum = worker.issueId ? Number(worker.issueId.split(",")[0]) : null;
|
||||||
|
|
||||||
|
// Fetch issue state if we have an issueId
|
||||||
|
let issue: Issue | null = null;
|
||||||
|
let currentLabel: StateLabel | null = null;
|
||||||
|
if (issueIdNum) {
|
||||||
|
issue = await fetchIssue(provider, issueIdNum);
|
||||||
|
currentLabel = issue ? provider.getCurrentStateLabel(issue) : null;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Helper to revert label
|
||||||
|
async function revertLabel(fix: HealthFix, from: StateLabel, to: StateLabel) {
|
||||||
|
if (!issueIdNum) return;
|
||||||
try {
|
try {
|
||||||
const id = Number(worker.issueId.split(",")[0]);
|
await provider.transitionLabel(issueIdNum, from, to);
|
||||||
await provider.transitionLabel(id, currentLabel, revertLabel);
|
fix.labelReverted = `${from} → ${to}`;
|
||||||
fix.labelReverted = `${currentLabel} → ${revertLabel}`;
|
|
||||||
} catch {
|
} catch {
|
||||||
fix.labelRevertFailed = true;
|
fix.labelRevertFailed = true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Check 1: Active but no session key for current level
|
// Helper to deactivate worker
|
||||||
|
async function deactivate(clearSessions = false) {
|
||||||
|
const updates: Record<string, unknown> = {
|
||||||
|
active: false,
|
||||||
|
issueId: null,
|
||||||
|
startTime: null,
|
||||||
|
};
|
||||||
|
if (clearSessions && worker.level) {
|
||||||
|
updates.sessions = { ...worker.sessions, [worker.level]: null };
|
||||||
|
}
|
||||||
|
await updateWorker(workspaceDir, groupId, role, updates);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Case 6: Active but issue doesn't exist (deleted/closed externally)
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
if (worker.active && issueIdNum && !issue) {
|
||||||
|
const fix: HealthFix = {
|
||||||
|
issue: {
|
||||||
|
type: "issue_gone",
|
||||||
|
severity: "critical",
|
||||||
|
project: project.name,
|
||||||
|
groupId,
|
||||||
|
role,
|
||||||
|
level: worker.level,
|
||||||
|
sessionKey,
|
||||||
|
issueId: worker.issueId,
|
||||||
|
message: `${role.toUpperCase()} active but issue #${issueIdNum} no longer exists or is closed`,
|
||||||
|
},
|
||||||
|
fixed: false,
|
||||||
|
};
|
||||||
|
if (autoFix) {
|
||||||
|
await deactivate(true);
|
||||||
|
fix.fixed = true;
|
||||||
|
}
|
||||||
|
fixes.push(fix);
|
||||||
|
return fixes; // No point checking further
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Case 2: Active but issue label is NOT the expected in-progress label
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
if (worker.active && issue && currentLabel !== expectedLabel) {
|
||||||
|
const fix: HealthFix = {
|
||||||
|
issue: {
|
||||||
|
type: "label_mismatch",
|
||||||
|
severity: "critical",
|
||||||
|
project: project.name,
|
||||||
|
groupId,
|
||||||
|
role,
|
||||||
|
level: worker.level,
|
||||||
|
sessionKey,
|
||||||
|
issueId: worker.issueId,
|
||||||
|
expectedLabel,
|
||||||
|
actualLabel: currentLabel,
|
||||||
|
message: `${role.toUpperCase()} active but issue #${issueIdNum} has label "${currentLabel}" (expected "${expectedLabel}")`,
|
||||||
|
},
|
||||||
|
fixed: false,
|
||||||
|
};
|
||||||
|
if (autoFix) {
|
||||||
|
await deactivate(true);
|
||||||
|
fix.fixed = true;
|
||||||
|
}
|
||||||
|
fixes.push(fix);
|
||||||
|
return fixes; // State is invalid, don't check session
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Case 1: Active with correct label but session is dead/missing
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
if (worker.active && sessionKey && !isSessionAlive(sessionKey, sessions)) {
|
||||||
|
const fix: HealthFix = {
|
||||||
|
issue: {
|
||||||
|
type: "session_dead",
|
||||||
|
severity: "critical",
|
||||||
|
project: project.name,
|
||||||
|
groupId,
|
||||||
|
role,
|
||||||
|
sessionKey,
|
||||||
|
level: worker.level,
|
||||||
|
issueId: worker.issueId,
|
||||||
|
message: `${role.toUpperCase()} active but session "${sessionKey}" not found in gateway`,
|
||||||
|
},
|
||||||
|
fixed: false,
|
||||||
|
};
|
||||||
|
if (autoFix) {
|
||||||
|
await revertLabel(fix, expectedLabel, queueLabel);
|
||||||
|
await deactivate(true);
|
||||||
|
fix.fixed = true;
|
||||||
|
}
|
||||||
|
fixes.push(fix);
|
||||||
|
return fixes;
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Case 1b: Active but no session key at all (shouldn't happen normally)
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
if (worker.active && !sessionKey) {
|
if (worker.active && !sessionKey) {
|
||||||
const fix: HealthFix = {
|
const fix: HealthFix = {
|
||||||
issue: {
|
issue: {
|
||||||
type: "active_no_session", severity: "critical",
|
type: "session_dead",
|
||||||
project: project.name, groupId, role,
|
severity: "critical",
|
||||||
|
project: project.name,
|
||||||
|
groupId,
|
||||||
|
role,
|
||||||
level: worker.level,
|
level: worker.level,
|
||||||
message: `${role.toUpperCase()} active but no session for level "${worker.level}"`,
|
issueId: worker.issueId,
|
||||||
|
message: `${role.toUpperCase()} active but no session key for level "${worker.level}"`,
|
||||||
},
|
},
|
||||||
fixed: false,
|
fixed: false,
|
||||||
};
|
};
|
||||||
if (autoFix) {
|
if (autoFix) {
|
||||||
await updateWorker(workspaceDir, groupId, role, { active: false, issueId: null, startTime: null });
|
if (issue && currentLabel === expectedLabel) {
|
||||||
|
await revertLabel(fix, expectedLabel, queueLabel);
|
||||||
|
}
|
||||||
|
await deactivate();
|
||||||
fix.fixed = true;
|
fix.fixed = true;
|
||||||
}
|
}
|
||||||
fixes.push(fix);
|
fixes.push(fix);
|
||||||
|
return fixes;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Check 2: Active with session but session is dead (zombie)
|
// ---------------------------------------------------------------------------
|
||||||
if (worker.active && sessionKey && activeSessions.length > 0 && !activeSessions.includes(sessionKey)) {
|
// Case 3: Active with correct label and alive session — check for staleness
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
if (worker.active && worker.startTime && sessionKey && isSessionAlive(sessionKey, sessions)) {
|
||||||
|
const hours = (Date.now() - new Date(worker.startTime).getTime()) / 3_600_000;
|
||||||
|
if (hours > 2) {
|
||||||
const fix: HealthFix = {
|
const fix: HealthFix = {
|
||||||
issue: {
|
issue: {
|
||||||
type: "zombie_session", severity: "critical",
|
type: "stale_worker",
|
||||||
project: project.name, groupId, role,
|
severity: "warning",
|
||||||
sessionKey, level: worker.level,
|
project: project.name,
|
||||||
message: `${role.toUpperCase()} session not in active sessions list`,
|
groupId,
|
||||||
|
role,
|
||||||
|
hoursActive: Math.round(hours * 10) / 10,
|
||||||
|
sessionKey,
|
||||||
|
issueId: worker.issueId,
|
||||||
|
message: `${role.toUpperCase()} active for ${Math.round(hours * 10) / 10}h — may need attention`,
|
||||||
},
|
},
|
||||||
fixed: false,
|
fixed: false,
|
||||||
};
|
};
|
||||||
|
// Stale workers get auto-fixed: revert label and deactivate
|
||||||
if (autoFix) {
|
if (autoFix) {
|
||||||
await revertIssueLabel(fix);
|
await revertLabel(fix, expectedLabel, queueLabel);
|
||||||
const sessions = { ...worker.sessions };
|
await deactivate();
|
||||||
if (worker.level) sessions[worker.level] = null;
|
|
||||||
await updateWorker(workspaceDir, groupId, role, { active: false, issueId: null, startTime: null, sessions });
|
|
||||||
fix.fixed = true;
|
fix.fixed = true;
|
||||||
}
|
}
|
||||||
fixes.push(fix);
|
fixes.push(fix);
|
||||||
}
|
}
|
||||||
|
// Otherwise: healthy, no issues to report
|
||||||
|
}
|
||||||
|
|
||||||
// Check 3: Inactive but still has issueId
|
// ---------------------------------------------------------------------------
|
||||||
|
// Case 4: Inactive but issue has stuck Doing/Testing label
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
if (!worker.active && issue && currentLabel === expectedLabel) {
|
||||||
|
const fix: HealthFix = {
|
||||||
|
issue: {
|
||||||
|
type: "stuck_label",
|
||||||
|
severity: "critical",
|
||||||
|
project: project.name,
|
||||||
|
groupId,
|
||||||
|
role,
|
||||||
|
issueId: worker.issueId,
|
||||||
|
expectedLabel: queueLabel,
|
||||||
|
actualLabel: currentLabel,
|
||||||
|
message: `${role.toUpperCase()} inactive but issue #${issueIdNum} still has "${currentLabel}" label`,
|
||||||
|
},
|
||||||
|
fixed: false,
|
||||||
|
};
|
||||||
|
if (autoFix) {
|
||||||
|
await revertLabel(fix, expectedLabel, queueLabel);
|
||||||
|
// Also clear the issueId if present
|
||||||
|
if (worker.issueId) {
|
||||||
|
await updateWorker(workspaceDir, groupId, role, { issueId: null });
|
||||||
|
}
|
||||||
|
fix.fixed = true;
|
||||||
|
}
|
||||||
|
fixes.push(fix);
|
||||||
|
return fixes;
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Case 5: Inactive but still has issueId set (orphan reference)
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
if (!worker.active && worker.issueId) {
|
if (!worker.active && worker.issueId) {
|
||||||
const fix: HealthFix = {
|
const fix: HealthFix = {
|
||||||
issue: {
|
issue: {
|
||||||
type: "inactive_with_issue", severity: "warning",
|
type: "orphan_issue_id",
|
||||||
project: project.name, groupId, role,
|
severity: "warning",
|
||||||
|
project: project.name,
|
||||||
|
groupId,
|
||||||
|
role,
|
||||||
issueId: worker.issueId,
|
issueId: worker.issueId,
|
||||||
message: `${role.toUpperCase()} inactive but still has issueId "${worker.issueId}"`,
|
message: `${role.toUpperCase()} inactive but still has issueId "${worker.issueId}"`,
|
||||||
},
|
},
|
||||||
@@ -119,28 +391,5 @@ export async function checkWorkerHealth(opts: {
|
|||||||
fixes.push(fix);
|
fixes.push(fix);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Check 4: Active for >2 hours (stale)
|
|
||||||
if (worker.active && worker.startTime && sessionKey) {
|
|
||||||
const hours = (Date.now() - new Date(worker.startTime).getTime()) / 3_600_000;
|
|
||||||
if (hours > 2) {
|
|
||||||
const fix: HealthFix = {
|
|
||||||
issue: {
|
|
||||||
type: "stale_worker", severity: "warning",
|
|
||||||
project: project.name, groupId, role,
|
|
||||||
hoursActive: Math.round(hours * 10) / 10,
|
|
||||||
sessionKey, issueId: worker.issueId,
|
|
||||||
message: `${role.toUpperCase()} active for ${Math.round(hours * 10) / 10}h — may need attention`,
|
|
||||||
},
|
|
||||||
fixed: false,
|
|
||||||
};
|
|
||||||
if (autoFix) {
|
|
||||||
await revertIssueLabel(fix);
|
|
||||||
await updateWorker(workspaceDir, groupId, role, { active: false, issueId: null, startTime: null });
|
|
||||||
fix.fixed = true;
|
|
||||||
}
|
|
||||||
fixes.push(fix);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return fixes;
|
return fixes;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -15,7 +15,7 @@ import fs from "node:fs";
|
|||||||
import path from "node:path";
|
import path from "node:path";
|
||||||
import { readProjects } from "../projects.js";
|
import { readProjects } from "../projects.js";
|
||||||
import { log as auditLog } from "../audit.js";
|
import { log as auditLog } from "../audit.js";
|
||||||
import { checkWorkerHealth } from "./health.js";
|
import { checkWorkerHealth, fetchGatewaySessions, type SessionLookup } from "./health.js";
|
||||||
import { projectTick } from "./tick.js";
|
import { projectTick } from "./tick.js";
|
||||||
import { createProvider } from "../providers/index.js";
|
import { createProvider } from "../providers/index.js";
|
||||||
import { notifyTickPickups, getNotificationConfig } from "../notify.js";
|
import { notifyTickPickups, getNotificationConfig } from "../notify.js";
|
||||||
@@ -184,12 +184,16 @@ async function processAllAgents(
|
|||||||
totalSkipped: 0,
|
totalSkipped: 0,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Fetch gateway sessions once for all agents/projects
|
||||||
|
const sessions = await fetchGatewaySessions();
|
||||||
|
|
||||||
for (const { agentId, workspace } of agents) {
|
for (const { agentId, workspace } of agents) {
|
||||||
const agentResult = await tick({
|
const agentResult = await tick({
|
||||||
workspaceDir: workspace,
|
workspaceDir: workspace,
|
||||||
agentId,
|
agentId,
|
||||||
config,
|
config,
|
||||||
pluginConfig,
|
pluginConfig,
|
||||||
|
sessions,
|
||||||
logger,
|
logger,
|
||||||
});
|
});
|
||||||
|
|
||||||
@@ -221,9 +225,10 @@ export async function tick(opts: {
|
|||||||
agentId?: string;
|
agentId?: string;
|
||||||
config: HeartbeatConfig;
|
config: HeartbeatConfig;
|
||||||
pluginConfig?: Record<string, unknown>;
|
pluginConfig?: Record<string, unknown>;
|
||||||
|
sessions: SessionLookup;
|
||||||
logger: { info(msg: string): void; warn(msg: string): void };
|
logger: { info(msg: string): void; warn(msg: string): void };
|
||||||
}): Promise<TickResult> {
|
}): Promise<TickResult> {
|
||||||
const { workspaceDir, agentId, config, pluginConfig } = opts;
|
const { workspaceDir, agentId, config, pluginConfig, sessions } = opts;
|
||||||
|
|
||||||
const data = await readProjects(workspaceDir);
|
const data = await readProjects(workspaceDir);
|
||||||
const projectIds = Object.keys(data.projects);
|
const projectIds = Object.keys(data.projects);
|
||||||
@@ -250,6 +255,7 @@ export async function tick(opts: {
|
|||||||
workspaceDir,
|
workspaceDir,
|
||||||
groupId,
|
groupId,
|
||||||
project,
|
project,
|
||||||
|
sessions,
|
||||||
);
|
);
|
||||||
|
|
||||||
// Budget check: stop if we've hit the limit
|
// Budget check: stop if we've hit the limit
|
||||||
@@ -304,6 +310,7 @@ async function performHealthPass(
|
|||||||
workspaceDir: string,
|
workspaceDir: string,
|
||||||
groupId: string,
|
groupId: string,
|
||||||
project: any,
|
project: any,
|
||||||
|
sessions: SessionLookup,
|
||||||
): Promise<number> {
|
): Promise<number> {
|
||||||
const { provider } = await createProvider({ repo: project.repo });
|
const { provider } = await createProvider({ repo: project.repo });
|
||||||
let fixedCount = 0;
|
let fixedCount = 0;
|
||||||
@@ -314,7 +321,7 @@ async function performHealthPass(
|
|||||||
groupId,
|
groupId,
|
||||||
project,
|
project,
|
||||||
role,
|
role,
|
||||||
activeSessions: [],
|
sessions,
|
||||||
autoFix: true,
|
autoFix: true,
|
||||||
provider,
|
provider,
|
||||||
});
|
});
|
||||||
@@ -332,5 +339,3 @@ async function checkProjectActive(workspaceDir: string, groupId: string): Promis
|
|||||||
if (!fresh) return false;
|
if (!fresh) return false;
|
||||||
return fresh.dev.active || fresh.qa.active;
|
return fresh.dev.active || fresh.qa.active;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -1,13 +1,21 @@
|
|||||||
/**
|
/**
|
||||||
* health — Worker health scan with optional auto-fix.
|
* health — Worker health scan with optional auto-fix.
|
||||||
*
|
*
|
||||||
|
* Triangulates projects.json, issue labels, and session state to detect:
|
||||||
|
* - session_dead: active worker but session missing in gateway
|
||||||
|
* - label_mismatch: active worker but issue not in expected label
|
||||||
|
* - stale_worker: active for >2h
|
||||||
|
* - stuck_label: inactive but issue has Doing/Testing label
|
||||||
|
* - orphan_issue_id: inactive but issueId set
|
||||||
|
* - issue_gone: active but issue deleted/closed
|
||||||
|
*
|
||||||
* Read-only by default (surfaces issues). Pass fix=true to apply fixes.
|
* Read-only by default (surfaces issues). Pass fix=true to apply fixes.
|
||||||
*/
|
*/
|
||||||
import { jsonResult } from "openclaw/plugin-sdk";
|
import { jsonResult } from "openclaw/plugin-sdk";
|
||||||
import type { ToolContext } from "../types.js";
|
import type { ToolContext } from "../types.js";
|
||||||
import { readProjects, getProject } from "../projects.js";
|
import { readProjects, getProject } from "../projects.js";
|
||||||
import { log as auditLog } from "../audit.js";
|
import { log as auditLog } from "../audit.js";
|
||||||
import { checkWorkerHealth, type HealthFix } from "../services/health.js";
|
import { checkWorkerHealth, fetchGatewaySessions, type HealthFix } from "../services/health.js";
|
||||||
import { requireWorkspaceDir, resolveProvider } from "../tool-helpers.js";
|
import { requireWorkspaceDir, resolveProvider } from "../tool-helpers.js";
|
||||||
|
|
||||||
export function createHealthTool() {
|
export function createHealthTool() {
|
||||||
@@ -20,20 +28,21 @@ export function createHealthTool() {
|
|||||||
properties: {
|
properties: {
|
||||||
projectGroupId: { type: "string", description: "Filter to specific project. Omit for all." },
|
projectGroupId: { type: "string", description: "Filter to specific project. Omit for all." },
|
||||||
fix: { type: "boolean", description: "Apply fixes for detected issues. Default: false (read-only)." },
|
fix: { type: "boolean", description: "Apply fixes for detected issues. Default: false (read-only)." },
|
||||||
activeSessions: { type: "array", items: { type: "string" }, description: "Active session IDs for zombie detection." },
|
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
|
|
||||||
async execute(_id: string, params: Record<string, unknown>) {
|
async execute(_id: string, params: Record<string, unknown>) {
|
||||||
const workspaceDir = requireWorkspaceDir(ctx);
|
const workspaceDir = requireWorkspaceDir(ctx);
|
||||||
const fix = (params.fix as boolean) ?? false;
|
const fix = (params.fix as boolean) ?? false;
|
||||||
const activeSessions = (params.activeSessions as string[]) ?? [];
|
|
||||||
|
|
||||||
const groupId = params.projectGroupId as string | undefined;
|
const groupId = params.projectGroupId as string | undefined;
|
||||||
|
|
||||||
const data = await readProjects(workspaceDir);
|
const data = await readProjects(workspaceDir);
|
||||||
const projectIds = groupId ? [groupId] : Object.keys(data.projects);
|
const projectIds = groupId ? [groupId] : Object.keys(data.projects);
|
||||||
|
|
||||||
|
// Fetch gateway sessions once for all projects
|
||||||
|
const sessions = await fetchGatewaySessions();
|
||||||
|
|
||||||
const issues: Array<HealthFix & { project: string; role: string }> = [];
|
const issues: Array<HealthFix & { project: string; role: string }> = [];
|
||||||
|
|
||||||
for (const pid of projectIds) {
|
for (const pid of projectIds) {
|
||||||
@@ -43,8 +52,13 @@ export function createHealthTool() {
|
|||||||
|
|
||||||
for (const role of ["dev", "qa"] as const) {
|
for (const role of ["dev", "qa"] as const) {
|
||||||
const fixes = await checkWorkerHealth({
|
const fixes = await checkWorkerHealth({
|
||||||
workspaceDir, groupId: pid, project, role, activeSessions,
|
workspaceDir,
|
||||||
autoFix: fix, provider,
|
groupId: pid,
|
||||||
|
project,
|
||||||
|
role,
|
||||||
|
sessions,
|
||||||
|
autoFix: fix,
|
||||||
|
provider,
|
||||||
});
|
});
|
||||||
issues.push(...fixes.map((f) => ({ ...f, project: project.name, role })));
|
issues.push(...fixes.map((f) => ({ ...f, project: project.name, role })));
|
||||||
}
|
}
|
||||||
@@ -55,14 +69,15 @@ export function createHealthTool() {
|
|||||||
fix,
|
fix,
|
||||||
issuesFound: issues.length,
|
issuesFound: issues.length,
|
||||||
issuesFixed: issues.filter((i) => i.fixed).length,
|
issuesFixed: issues.filter((i) => i.fixed).length,
|
||||||
|
sessionsCached: sessions.size,
|
||||||
});
|
});
|
||||||
|
|
||||||
return jsonResult({
|
return jsonResult({
|
||||||
success: true,
|
success: true,
|
||||||
fix,
|
fix,
|
||||||
projectsScanned: projectIds.length,
|
projectsScanned: projectIds.length,
|
||||||
|
sessionsQueried: sessions.size,
|
||||||
issues,
|
issues,
|
||||||
note: activeSessions.length === 0 ? "No activeSessions provided — zombie detection skipped." : undefined,
|
|
||||||
});
|
});
|
||||||
},
|
},
|
||||||
});
|
});
|
||||||
|
|||||||
Reference in New Issue
Block a user