Skip to main content

hematite/agent/
correlation.rs

/// A unified diagnosis produced by cross-topic finding correlation.
///
/// Correlation detects when independent inspect_host findings share a common
/// root cause. Each rule requires ALL of its signals to appear in the combined
/// raw output (AND logic). When a rule fires it means two or more separate
/// topics reported conditions that almost certainly trace back to one
/// underlying problem, and the rule's text is handed back in this struct.
///
/// All fields borrow `'static` data, so cloning a finding is cheap.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CorrelatedFinding {
    /// Confidence label: `"HIGH"` or `"MEDIUM"`.
    pub confidence: &'static str,
    /// One-sentence root cause.
    pub summary: &'static str,
    /// Explanation of why these findings are connected.
    pub detail: &'static str,
    /// Remediation steps for the shared root cause, in the order they are listed.
    pub unified_steps: &'static [&'static str],
}
13
/// One entry of the correlation rule table: the signals that must co-occur
/// plus the diagnosis text reported when they do.
struct CorrelationRule {
    /// Substrings to look for; ALL must appear in the lowercased combined output.
    signals: &'static [&'static str],
    /// Confidence label copied into the resulting finding.
    confidence: &'static str,
    /// One-sentence root cause copied into the resulting finding.
    summary: &'static str,
    /// Explanation of the connection, copied into the resulting finding.
    detail: &'static str,
    /// Remediation steps copied into the resulting finding.
    unified_steps: &'static [&'static str],
}
21
22static RULES: &[CorrelationRule] = &[
23    // ── Drive failure causing system crashes ──────────────────────────────────
24    CorrelationRule {
25        signals: &["healthstatus: unhealthy", "system crashes / unexpected shutdowns:"],
26        confidence: "HIGH",
27        summary: "Failing drive is almost certainly causing these system crashes",
28        detail: "Disk health shows an unhealthy drive at the same time the crash log shows \
29                 unexpected shutdowns. Drives with SMART failures cause read errors that \
30                 Windows cannot recover from gracefully, triggering BSODs.",
31        unified_steps: &[
32            "Back up all important files immediately — do not wait, the drive may fail completely at any time",
33            "Verify the failure: PowerShell (admin) → Get-PhysicalDisk | Select FriendlyName, HealthStatus — if Unhealthy, schedule replacement",
34            "Run hematite --inspect disk_health for the full SMART status before replacing",
35            "Replace the drive before doing anything else — no software fix will resolve hardware failure",
36            "After replacement: reinstall Windows from USB or restore from backup",
37        ],
38    },
39
40    // ── Drive failure + nearly full — critical double jeopardy ────────────────
41    CorrelationRule {
42        signals: &["healthstatus: unhealthy", "very low"],
43        confidence: "HIGH",
44        summary: "Drive is both failing and almost full — immediate data loss risk",
45        detail: "A physically unhealthy drive that is also nearly out of space faces two \
46                 simultaneous failure modes. Writes to a full drive with SMART errors have \
47                 an elevated chance of corruption.",
48        unified_steps: &[
49            "URGENT: copy your most important files to an external drive or cloud storage right now",
50            "Do not install anything, run updates, or make large writes until the drive is replaced",
51            "Run hematite --inspect disk_health to confirm the SMART status",
52            "Replace the drive as soon as possible — this combination is high-risk for total data loss",
53        ],
54    },
55
56    // ── Drive saturation caused by SMART errors ───────────────────────────────
57    CorrelationRule {
58        signals: &["average disk queue", "healthstatus: unhealthy"],
59        confidence: "HIGH",
60        summary: "100% disk usage is caused by the failing drive retrying bad sectors",
61        detail: "High disk queue length combined with an unhealthy SMART status is a classic \
62                 pattern: the OS is retrying reads and writes on failing sectors, creating the \
63                 appearance of disk saturation. The drive is the root cause, not software.",
64        unified_steps: &[
65            "Do not defrag or benchmark the drive — it will accelerate failure",
66            "Back up data immediately, then replace the drive",
67            "After replacement the disk saturation will resolve automatically",
68            "Run hematite --inspect disk_health to get the exact SMART error count before replacing",
69        ],
70    },
71
72    // ── Thermal throttling causing crashes ────────────────────────────────────
73    CorrelationRule {
74        signals: &["throttle reason:", "bsod (bugcheck)"],
75        confidence: "HIGH",
76        summary: "Overheating is triggering these system crashes",
77        detail: "The overclocker telemetry shows active thermal throttling at the same time \
78                 the crash log records BSODs. When a CPU or GPU exceeds its thermal limit the \
79                 system can become unstable and crash rather than just slow down.",
80        unified_steps: &[
81            "Clean the CPU and GPU heatsinks with compressed air — dust buildup is the most common cause",
82            "Check that all case fans are spinning and airflow is not blocked",
83            "Run hematite --inspect thermal and --inspect overclocker to see current temps under load",
84            "For laptops: use a cooling pad and ensure the bottom vents are not blocked",
85            "If temps remain high after cleaning: reapply thermal paste (CPU) or replace thermal pads (GPU)",
86            "Only investigate the BSODs further after the thermal issue is resolved — they may stop on their own",
87        ],
88    },
89
90    // ── M365 auth broker cascade: Teams + Outlook both affected ──────────────
91    CorrelationRule {
92        signals: &[
93            "tokenbroker | status: stopped",
94            "classicteamscache |",
95            "profilecount:",
96        ],
97        confidence: "HIGH",
98        summary: "One auth broker failure is breaking Teams AND Outlook simultaneously",
99        detail: "The Microsoft 365 token broker (WAM) is not running. Teams and Outlook both \
100                 depend on the same authentication service — fixing the broker resolves sign-in \
101                 failures in both apps at once. Clearing Teams cache or recreating Outlook \
102                 profiles before fixing the broker will not work.",
103        unified_steps: &[
104            "Fix the auth broker first: PowerShell (admin) → Restart-Service TokenBroker -Force",
105            "If the service won't start: PowerShell (admin) → sfc /scannow to repair the system files WAM depends on",
106            "Sign out and back in to your work account: Settings → Accounts → Access work or school → disconnect → reconnect",
107            "Only after the broker is running: clear Teams cache (%AppData%\\Microsoft\\Teams\\Cache, etc.)",
108            "Only after the broker is running: test Outlook sign-in — in most cases it recovers without further steps",
109            "If the device is Intune/AAD joined and the broker still fails: run dsregcmd /leave then dsregcmd /join (admin)",
110        ],
111    },
112
113    // ── Auth broker + Teams only ──────────────────────────────────────────────
114    CorrelationRule {
115        signals: &["tokenbroker | status: stopped", "classicteamscache |"],
116        confidence: "HIGH",
117        summary: "Teams sign-in failure is caused by the M365 auth broker, not the cache",
118        detail: "The token broker (WAM) is not running. Clearing the Teams cache without \
119                 fixing the auth broker will result in Teams asking to sign in and then \
120                 failing silently. Fix the broker first.",
121        unified_steps: &[
122            "Restart the token broker: PowerShell (admin) → Restart-Service TokenBroker -Force",
123            "Verify it is running: Get-Service TokenBroker | Select Status",
124            "Only after the broker is confirmed running: clear the Teams cache and relaunch Teams",
125            "If sign-in still fails after clearing cache: run hematite --inspect identity_auth for the full auth chain state",
126        ],
127    },
128
129    // ── Auth broker + Outlook only ────────────────────────────────────────────
130    CorrelationRule {
131        signals: &["tokenbroker | status: stopped", "profilecount:"],
132        confidence: "HIGH",
133        summary: "Outlook sign-in failure is caused by the M365 auth broker",
134        detail: "The WAM token broker is not running. Outlook will loop on the sign-in screen \
135                 or show repeated password prompts until the broker is restored. Recreating the \
136                 mail profile before fixing the broker will reproduce the same failure.",
137        unified_steps: &[
138            "Restart the token broker: PowerShell (admin) → Restart-Service TokenBroker -Force",
139            "Clear stale Office credentials: Win+R → control keymgr.dll → Windows Credentials → remove all MicrosoftOffice16_Data:SSPI:* entries",
140            "Relaunch Outlook — it should sign in automatically once the broker is running",
141            "If Outlook still fails: run hematite --inspect identity_auth for the full auth chain state",
142        ],
143    },
144
145    // ── Pending reboot causing system crashes ─────────────────────────────────
146    CorrelationRule {
147        signals: &[
148            "windows update requires a restart",
149            "system crashes / unexpected shutdowns:",
150        ],
151        confidence: "HIGH",
152        summary: "Incomplete Windows Update is likely causing these crashes",
153        detail: "A Windows Update is staged but the system has not restarted to complete \
154                 installation. Half-applied updates leave driver and system files in a mixed \
155                 state that can cause BSODs and unexpected shutdowns until the restart completes.",
156        unified_steps: &[
157            "Save all open work and restart the computer to complete the pending update",
158            "After restart, run Windows Update again to confirm no further updates are pending",
159            "If crashes continue after the restart: run hematite --inspect recent_crashes to check for a different root cause",
160        ],
161    },
162
163    // ── WMI corruption causing multiple failures ──────────────────────────────
164    CorrelationRule {
165        signals: &["wmi repository is inconsistent", "system crashes / unexpected shutdowns:"],
166        confidence: "HIGH",
167        summary: "WMI corruption is the likely root cause of multiple tool failures and crashes",
168        detail: "WMI (Windows Management Instrumentation) is the data bus that Windows, \
169                 Defender, PowerShell, and many system tools depend on. A corrupt repository \
170                 causes cascading failures across diagnostics, updates, and system stability. \
171                 Fix WMI before investigating any other issues.",
172        unified_steps: &[
173            "Stop WMI: PowerShell (admin) → net stop winmgmt /y",
174            "Rebuild the repository: PowerShell (admin) → winmgmt /resetrepository",
175            "Start WMI: PowerShell (admin) → net start winmgmt",
176            "Verify: PowerShell → winmgmt /verifyrepository — should say 'WMI repository is consistent'",
177            "Restart the machine after repair — WMI caches are session-scoped",
178            "After restart, rerun hematite --triage to see if other issues have resolved themselves",
179        ],
180    },
181
182    // ── VPN blocking internet connectivity ────────────────────────────────────
183    CorrelationRule {
184        signals: &["vpn adapter detected", "unreachable"],
185        confidence: "MEDIUM",
186        summary: "Active VPN may be causing the connectivity failure",
187        detail: "A VPN adapter is present and active at the same time the connectivity \
188                 check reports the internet as unreachable. VPN misrouting, split-tunnel \
189                 issues, or a disconnected VPN tunnel that still holds the routing table \
190                 are common causes of this pattern.",
191        unified_steps: &[
192            "Disconnect the VPN and test internet connectivity directly: ping 1.1.1.1",
193            "If internet works without VPN: the VPN client or tunnel config is the problem — contact your VPN provider or IT admin",
194            "If internet is still broken without VPN: run hematite --inspect connectivity for the base network issue",
195            "Flush DNS after disconnecting VPN: PowerShell (admin) → ipconfig /flushdns",
196            "Check for stale routes left by the VPN: PowerShell (admin) → route print — delete any routes pointing to the VPN adapter",
197        ],
198    },
199
200    // ── Teams cache + crash evidence ─────────────────────────────────────────
201    CorrelationRule {
202        signals: &["classicteamscache |", "application error |"],
203        confidence: "MEDIUM",
204        summary: "Bloated Teams cache is likely causing Teams to crash",
205        detail: "The Teams cache directory is large and there is crash evidence in the \
206                 Application event log. A corrupted or oversized cache is the #1 cause of \
207                 Teams instability — clearing it resolves most Teams crash and hang issues.",
208        unified_steps: &[
209            "Quit Teams completely: right-click the system tray icon → Quit",
210            "Clear Classic Teams cache: Win+R → %AppData%\\Microsoft\\Teams → delete Cache, blob_storage, databases, GPUCache, IndexedDB, Local Storage, tmp",
211            "Clear New Teams cache: Win+R → %LocalAppData%\\Packages\\MSTeams_8wekyb3d8bbwe\\LocalCache\\ → delete all contents",
212            "Relaunch Teams and sign in — the cache rebuilds from the server automatically",
213            "If Teams still crashes after cache clear: run hematite --inspect identity_auth to check auth broker state",
214        ],
215    },
216
217    // ── Defender disabled with active outbound connections ───────────────────
218    CorrelationRule {
219        signals: &["real-time protection: off", "established"],
220        confidence: "MEDIUM",
221        summary: "Defender is disabled while the machine has active network connections — security risk",
222        detail: "Real-time protection is off at the same time the machine has established \
223                 TCP connections. Without Defender active, malware can exfiltrate data or \
224                 phone home over these connections without being intercepted.",
225        unified_steps: &[
226            "Re-enable Defender real-time protection immediately: Windows Security → Virus & threat protection → Real-time protection → On",
227            "Run a full scan: Windows Security → Virus & threat protection → Quick scan (then Full scan if quick finds anything)",
228            "Check what process owns the active connections: run hematite --inspect connections and look for unknown remote addresses",
229            "If you cannot turn Defender back on, a third-party AV or Group Policy may be blocking it — run hematite --inspect security for the full picture",
230        ],
231    },
232
233    // ── Thermal throttling causing high CPU readings (no crash) ──────────────
234    CorrelationRule {
235        signals: &["[warning] cpu load is extremely high", "throttle reason:"],
236        confidence: "HIGH",
237        summary: "Thermal throttling is the root cause of the high CPU readings",
238        detail: "The system reports near-100% CPU at the same time the thermal subsystem \
239                 is actively throttling the processor. Windows measures CPU utilization as \
240                 a fraction of the current throttled frequency — a CPU running at 30% of \
241                 its rated speed will show 100% load even under a light workload. Fixing \
242                 the thermal issue will resolve the apparent CPU saturation without touching \
243                 any running applications.",
244        unified_steps: &[
245            "Clean the CPU and GPU heatsinks with compressed air — dust buildup is the most common cause of thermal throttling",
246            "Check that all case fans are spinning and airflow paths are not blocked",
247            "For laptops: run on a hard flat surface with bottom vents unobstructed, or use a cooling pad",
248            "Run hematite --inspect thermal to see current temperatures under load",
249            "Run hematite --inspect overclocker to see the active throttle reason (Power vs Thermal)",
250            "If temperatures are normal but throttling persists: check the active power plan — Power Saver or Balanced can cap CPU frequency",
251            "After fixing cooling: the high CPU reading will drop because the chip can now run at its rated speed",
252        ],
253    },
254
255    // ── RAM pressure causing disk saturation via pagefile swapping ───────────
256    CorrelationRule {
257        signals: &["[warning] memory usage is near capacity", "average disk queue"],
258        confidence: "HIGH",
259        summary: "Low RAM is causing disk saturation through pagefile swapping",
260        detail: "Resource monitor shows RAM near capacity at the same time disk queue \
261                 length is elevated. When physical RAM is exhausted, Windows offloads \
262                 active memory pages to the pagefile on the system drive. This appears \
263                 as near-100% disk usage and makes the machine feel completely \
264                 unresponsive. The root cause is RAM pressure, not a disk hardware problem.",
265        unified_steps: &[
266            "Identify the RAM consumer: Task Manager → Performance → Memory → Open Resource Monitor → Memory tab",
267            "Close unused apps — browsers (50–200 MB per tab), Electron apps (Teams, Slack, VS Code), and IDEs are the most common culprits",
268            "Run hematite --inspect resource_load to rank processes by memory consumption right now",
269            "If RAM is chronically full with normal use: this machine needs a RAM upgrade",
270            "Short-term relief: increase the pagefile size — System Properties → Advanced → Performance Settings → Advanced → Virtual Memory → Custom size",
271            "Do not defrag while this condition is active — the disk is already under heavy pagefile I/O load",
272        ],
273    },
274
275    // ── Windows Installer disabled + CBS reboot pending ──────────────────────
276    CorrelationRule {
277        signals: &[
278            "windows installer service (msiserver) is disabled",
279            "windows component install/update requires a restart",
280        ],
281        confidence: "HIGH",
282        summary: "Pending reboot and disabled Windows Installer are both blocking software installations",
283        detail: "The Windows Installer service (msiserver) is disabled — no MSI-based \
284                 application can install or update. A Windows component update is also \
285                 staged and waiting for a restart. The pending reboot may restore the \
286                 Installer service as part of CBS completion, so restart first before \
287                 manually re-enabling.",
288        unified_steps: &[
289            "Restart the computer first — this completes the pending Windows component update (CBS)",
290            "After restart: check if Windows Installer recovered: PowerShell → Get-Service msiserver",
291            "If msiserver is still disabled: PowerShell (admin) → Set-Service msiserver -StartupType Manual; Start-Service msiserver",
292            "Verify: Get-Service msiserver | Select Status, StartType — expect Running / Manual",
293            "Retry the failed installation once both fixes are confirmed",
294            "Run hematite --inspect installer_health after restart to verify all install-related services are healthy",
295        ],
296    },
297];
298
299/// Run all correlation rules against the combined raw inspect output.
300/// Returns all rules where every signal is present (AND logic).
301/// Results are ordered HIGH confidence first.
302pub fn correlate_findings(raw_output: &str) -> Vec<CorrelatedFinding> {
303    let lower = raw_output.to_ascii_lowercase();
304    let mut results: Vec<CorrelatedFinding> = RULES
305        .iter()
306        .filter(|rule| rule.signals.iter().all(|s| lower.contains(*s)))
307        .map(|rule| CorrelatedFinding {
308            confidence: rule.confidence,
309            summary: rule.summary,
310            detail: rule.detail,
311            unified_steps: rule.unified_steps,
312        })
313        .collect();
314
315    // HIGH confidence first, MEDIUM second.
316    results.sort_by_key(|r| if r.confidence == "HIGH" { 0u8 } else { 1 });
317    results
318}