1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
use anyhow::{bail, Context, Result};
use chrono::Utc;
use super::core::SyncManager;
use super::HUB_BRANCH;
use crate::identity::AgentConfig;
use crate::locks::Heartbeat;
impl SyncManager {
/// Write this agent's heartbeat file, commit it, and push it to the hub branch.
///
/// Acquires the hub write lock first to prevent races with concurrent git
/// operations (fetch, `write_commit_push`) in the same cache worktree (#352).
/// The guard is kept alive until this function returns.
///
/// The heartbeat is written to `heartbeats/{agent_id}.json` inside the cache
/// directory, staged, and committed. If git reports that there is nothing to
/// commit, the function returns early without pushing.
///
/// The push itself is best-effort: an offline remote, a rejected push that still
/// fails after one `pull --rebase` retry, or any other push failure is logged as a
/// warning and the heartbeat stays committed locally.
///
/// # Errors
///
/// Returns an error if the lock cannot be acquired, if the heartbeat cannot be
/// serialized, written, or staged, if the commit fails for a reason other than
/// "nothing to commit", or if — after a rejected push — the divergence check, the
/// dirty-state cleanup, or the `pull --rebase` recovery fails.
pub fn push_heartbeat(&self, agent: &AgentConfig, active_issue_id: Option<i64>) -> Result<()> {
    // Serialize with other cache mutations (#352).
    let _lock_guard = self.acquire_lock()?;

    let heartbeat = Heartbeat {
        agent_id: agent.agent_id.clone(),
        last_heartbeat: Utc::now(),
        active_issue_id,
        machine_id: agent.machine_id.clone(),
    };

    // Ensure the heartbeats directory exists, then write the file.
    let hb_dir = self.cache_dir.join("heartbeats");
    std::fs::create_dir_all(&hb_dir)?;
    let filename = format!("{}.json", agent.agent_id);
    let path = hb_dir.join(&filename);
    let json = serde_json::to_string_pretty(&heartbeat)?;
    std::fs::write(&path, json)?;

    // Stage the heartbeat file.
    self.git_in_cache(&["add", &format!("heartbeats/{filename}")])?;

    // Reuse the heartbeat's own timestamp so the commit message always agrees
    // with the file content instead of sampling the clock a second time.
    let msg = format!(
        "heartbeat: {} at {}",
        agent.agent_id,
        heartbeat.last_heartbeat.format("%Y-%m-%dT%H:%M:%SZ")
    );
    // Commit (may fail if nothing changed, that's fine).
    let commit_result = self.git_commit_in_cache(&["-m", &msg]);
    if let Err(e) = &commit_result {
        let err_str = e.to_string();
        if err_str.contains("nothing to commit") || err_str.contains("no changes added") {
            return Ok(());
        }
        commit_result?;
    }

    // Push (best-effort — may fail if offline or on conflicts).
    let Err(e) = self.git_in_cache(&["push", &self.remote, HUB_BRANCH]) else {
        return Ok(());
    };
    let err_str = e.to_string();

    if err_str.contains("Could not resolve host")
        || err_str.contains("Could not read from remote")
    {
        tracing::warn!("heartbeat push failed (offline), changes saved locally only");
        return Ok(());
    }

    // If the push is rejected (conflict), clean dirty state and try pull+push once.
    if err_str.contains("rejected") || err_str.contains("non-fast-forward") {
        // Bail if local has diverged too far — sign of a rebase loop.
        self.check_divergence()?;
        self.clean_dirty_state()?;
        if self
            .git_in_cache(&["pull", "--rebase", &self.remote, HUB_BRANCH])
            .is_err()
        {
            // First pull failed: run the hub health check, then try once more.
            self.hub_health_check();
            self.git_in_cache(&["pull", "--rebase", &self.remote, HUB_BRANCH])?;
        }
        if let Err(retry_err) = self.git_in_cache(&["push", &self.remote, HUB_BRANCH]) {
            tracing::warn!(
                "heartbeat push failed after retry (conflict), changes saved locally only: {}",
                retry_err
            );
        }
        return Ok(());
    }

    // Any other push failure is still non-fatal, but must not vanish silently.
    tracing::warn!(
        "heartbeat push failed, changes saved locally only: {}",
        e
    );
    Ok(())
}
/// Load every V1 heartbeat stored as `heartbeats/*.json` in the cache directory.
///
/// A missing `heartbeats/` directory yields an empty list. Entries whose extension
/// is not `json` are ignored, and files whose content does not deserialize into a
/// [`Heartbeat`] are skipped.
///
/// # Errors
///
/// Returns an error if the heartbeats directory or one of its entries cannot be
/// listed, or if a `.json` file in it cannot be read as text.
pub fn read_heartbeats(&self) -> Result<Vec<Heartbeat>> {
    let hb_dir = self.cache_dir.join("heartbeats");
    let mut found = Vec::new();

    if hb_dir.exists() {
        for item in std::fs::read_dir(&hb_dir)? {
            let file = item?.path();
            let is_json = file.extension().is_some_and(|ext| ext == "json");
            if !is_json {
                continue;
            }
            let text = std::fs::read_to_string(&file)?;
            // Unparseable files contribute nothing; `extend` over the `Option` keeps
            // only successfully decoded heartbeats.
            found.extend(serde_json::from_str::<Heartbeat>(&text).ok());
        }
    }

    Ok(found)
}
/// Load heartbeats stored in the V2 layout (`agents/{id}/heartbeat.json`).
///
/// Each file is first tried as a native [`Heartbeat`]. If that fails it is read as
/// generic JSON in the V2 shape, which carries an RFC 3339 `timestamp` field in
/// place of `last_heartbeat` and may omit `active_issue_id` / `machine_id`. For the
/// V2 shape the agent id is taken from the directory name, a missing
/// `active_issue_id` becomes `None`, and a missing `machine_id` becomes an empty
/// string.
///
/// Non-directory entries, agent directories without a `heartbeat.json`, unreadable
/// files, and files that are not valid JSON are skipped. A V2 file with a missing
/// or unparseable `timestamp` is skipped with a warning.
///
/// # Errors
///
/// Returns an error if the agents directory or one of its entries cannot be listed,
/// or if an entry's file type cannot be determined.
pub fn read_heartbeats_v2(&self) -> Result<Vec<Heartbeat>> {
    let root = self.cache_dir.join("agents");
    let mut collected = Vec::new();
    if !root.exists() {
        return Ok(collected);
    }

    for dir_entry in std::fs::read_dir(&root)? {
        let dir_entry = dir_entry?;
        let is_agent_dir = dir_entry.file_type()?.is_dir();
        if !is_agent_dir {
            continue;
        }

        let agent_id = dir_entry.file_name().to_string_lossy().to_string();
        let hb_file = dir_entry.path().join("heartbeat.json");
        if !hb_file.exists() {
            continue;
        }
        let Ok(raw) = std::fs::read_to_string(&hb_file) else {
            continue;
        };

        // Native format wins when the file already matches `Heartbeat`.
        if let Ok(native) = serde_json::from_str::<Heartbeat>(&raw) {
            collected.push(native);
            continue;
        }

        // Otherwise fall back to the looser V2 JSON shape.
        let Ok(doc) = serde_json::from_str::<serde_json::Value>(&raw) else {
            continue;
        };

        let parsed_ts = doc
            .get("timestamp")
            .and_then(serde_json::Value::as_str)
            .and_then(|raw_ts| chrono::DateTime::parse_from_rfc3339(raw_ts).ok());
        let Some(parsed_ts) = parsed_ts else {
            tracing::warn!(
                "corrupt or missing timestamp in heartbeat for agent '{}', skipping",
                agent_id
            );
            continue;
        };

        let machine_id = doc
            .get("machine_id")
            .and_then(serde_json::Value::as_str)
            .map(str::to_owned)
            .unwrap_or_default();

        collected.push(Heartbeat {
            agent_id,
            last_heartbeat: parsed_ts.with_timezone(&Utc),
            active_issue_id: doc
                .get("active_issue_id")
                .and_then(serde_json::Value::as_i64),
            machine_id,
        });
    }

    Ok(collected)
}
/// Read heartbeats using the appropriate method based on hub layout version.
///
/// V1: reads `heartbeats/*.json`
/// V2: reads `agents/*/heartbeat.json`, merged with any V1 heartbeats
///
/// # Errors
///
/// Returns an error if heartbeat files cannot be read.
pub fn read_heartbeats_auto(&self) -> Result<Vec<Heartbeat>> {
use std::collections::HashMap;
let mut heartbeats = self.read_heartbeats()?;
if self.is_v2_layout() {
let v2 = self.read_heartbeats_v2()?;
// Merge V2 heartbeats, preferring the one with the most recent timestamp
let mut by_agent: HashMap<String, Heartbeat> = HashMap::new();
for hb in heartbeats.into_iter().chain(v2) {
by_agent
.entry(hb.agent_id.clone())
.and_modify(|existing| {
if hb.last_heartbeat > existing.last_heartbeat {
*existing = hb.clone();
}
})
.or_insert(hb);
}
heartbeats = by_agent.into_values().collect();
}
Ok(heartbeats)
}
/// Bootstrap `agents/{agent_id}/` on the hub branch when it is not there yet.
///
/// Writes an initial `agents/{agent_id}/heartbeat.json`, stages and commits it,
/// then pushes. A rejected push is retried after a `pull --rebase`, for at most
/// three push attempts in total. When the remote is unreachable the commit is left
/// local and the call still succeeds.
///
/// Returns `Ok(true)` if the directory was created, `Ok(false)` if it already
/// existed (in which case no git operation is performed).
///
/// # Errors
///
/// Returns an error if the directory or heartbeat file cannot be created, if
/// staging or committing fails, if the divergence check or the `pull --rebase`
/// recovery fails, if the push is still rejected on the third attempt, or if the
/// push fails for any reason other than "offline" or "rejected".
pub fn ensure_agent_dir(&self, agent_id: &str) -> Result<bool> {
    let freshly_created = self.create_agent_dir_files(agent_id)?;
    if !freshly_created {
        return Ok(false);
    }

    // Stage and commit the initial heartbeat.
    let staged_path = format!("agents/{agent_id}/heartbeat.json");
    self.git_in_cache(&["add", &staged_path])?;
    let commit_msg = format!("bootstrap: initialize agent directory for {agent_id}");
    self.git_commit_in_cache(&["-m", &commit_msg])?;

    // Push, recovering from rejected pushes via pull --rebase.
    for retries_left in (0..3).rev() {
        let Err(push_err) = self.git_in_cache(&["push", &self.remote, HUB_BRANCH]) else {
            return Ok(true);
        };
        let reason = push_err.to_string();

        let offline = reason.contains("Could not resolve host")
            || reason.contains("Could not read from remote");
        if offline {
            // Offline — the commit stays local.
            return Ok(true);
        }

        let rejected = reason.contains("rejected") || reason.contains("non-fast-forward");
        if !rejected {
            return Err(push_err);
        }
        if retries_left == 0 {
            bail!("Push failed after 3 retries for agent dir {agent_id}");
        }

        // Bail if local has diverged too far — sign of a rebase loop.
        self.check_divergence()?;
        let first_pull = self.git_in_cache(&["pull", "--rebase", &self.remote, HUB_BRANCH]);
        if first_pull.is_err() {
            // First pull failed: run the hub health check, then try once more.
            self.hub_health_check();
            self.git_in_cache(&["pull", "--rebase", &self.remote, HUB_BRANCH])?;
        }
    }

    Ok(true)
}
/// Create the agent directory and heartbeat file on disk (no git ops).
///
/// Returns `Ok(true)` if created, `Ok(false)` if the directory already exists.
pub(super) fn create_agent_dir_files(&self, agent_id: &str) -> Result<bool> {
let agents_dir = self.cache_dir.join("agents").join(agent_id);
if agents_dir.exists() {
return Ok(false);
}
std::fs::create_dir_all(&agents_dir)
.with_context(|| format!("Failed to create agent directory for {agent_id}"))?;
// Write initial heartbeat
let heartbeat = serde_json::json!({
"agent_id": agent_id,
"timestamp": chrono::Utc::now().to_rfc3339(),
"status": "active"
});
let heartbeat_path = agents_dir.join("heartbeat.json");
std::fs::write(&heartbeat_path, serde_json::to_string_pretty(&heartbeat)?)
.with_context(|| "Failed to write initial heartbeat")?;
Ok(true)
}
}