1use anyhow::{Context, Result};
2use chrono::{DateTime, Utc};
3use glob::Pattern;
4use serde::{Deserialize, Serialize};
5use std::{
6 collections::HashMap,
7 collections::HashSet,
8 env, fs,
9 path::{Component, Path, PathBuf},
10};
11use tracing::warn;
12
13use crate::config::{AppConfig, data_dir, default_sources_path};
14
15#[derive(Debug, Clone, Serialize, Deserialize)]
16pub struct SourceDef {
17 pub id: String,
18 pub display_name: Option<String>,
19 pub roots: Vec<String>,
20 pub globs: Vec<String>,
21 pub format: String,
22 pub parser_hint: Option<String>,
23 pub platforms: Option<Vec<String>>,
24 pub requires_opt_in: Option<bool>,
25}
26
27#[derive(Debug, Clone, Default, Serialize, Deserialize)]
28pub struct SourceManifest {
29 pub version: Option<u32>,
30 pub sources: Vec<SourceDef>,
31}
32
33#[derive(Debug, Clone, Serialize, Deserialize)]
34struct CachedRegistry {
35 fetched_at: DateTime<Utc>,
36 etag: Option<String>,
37 manifest: SourceManifest,
38}
39
40pub fn builtin_sources() -> Vec<SourceDef> {
41 vec![
42 SourceDef {
43 id: "codex_cli".to_string(),
44 display_name: Some("Codex CLI".to_string()),
45 roots: vec!["~/.codex/sessions".to_string()],
46 globs: vec!["**/*".to_string()],
47 format: "jsonl".to_string(),
48 parser_hint: Some("codex_cli_v1".to_string()),
49 platforms: None,
50 requires_opt_in: Some(false),
51 },
52 SourceDef {
53 id: "claude_code".to_string(),
54 display_name: Some("Claude Code".to_string()),
55 roots: vec!["~/.claude/projects".to_string()],
56 globs: vec!["**/sessions/*.jsonl".to_string()],
57 format: "jsonl".to_string(),
58 parser_hint: Some("claude_code_v1".to_string()),
59 platforms: None,
60 requires_opt_in: Some(false),
61 },
62 SourceDef {
63 id: "vscode_global_storage".to_string(),
64 display_name: Some("VS Code Global Storage".to_string()),
65 roots: vec![
66 "~/.config/Code/User/globalStorage".to_string(),
67 "~/Library/Application Support/Code/User/globalStorage".to_string(),
68 "~/AppData/Roaming/Code/User/globalStorage".to_string(),
69 ],
70 globs: vec!["**/*.jsonl".to_string(), "**/*.json".to_string()],
71 format: "jsonl".to_string(),
72 parser_hint: Some("vscode_storage_v1".to_string()),
73 platforms: None,
74 requires_opt_in: Some(false),
75 },
76 SourceDef {
77 id: "tandem_sessions".to_string(),
78 display_name: Some("Tandem Sessions".to_string()),
79 roots: vec![
80 "~/.local/share/tandem/data/storage".to_string(),
81 "~/Library/Application Support/tandem/data/storage".to_string(),
82 "~/AppData/Roaming/tandem/data/storage".to_string(),
83 ],
84 globs: vec!["**/sessions.json".to_string()],
85 format: "json".to_string(),
86 parser_hint: Some("tandem_v1".to_string()),
87 platforms: Some(vec![
88 "linux".to_string(),
89 "macos".to_string(),
90 "windows".to_string(),
91 ]),
92 requires_opt_in: Some(true),
93 },
94 ]
95}
96
97pub async fn resolve_sources(config: &AppConfig) -> Result<Vec<SourceDef>> {
98 let mut merged = builtin_sources();
99
100 if config.remote_registry.enabled {
101 if let Ok(remote) = load_remote_registry(config).await {
102 merged.extend(remote.sources);
103 }
104 }
105
106 if let Some(local) = load_local_sources(config)? {
107 merged.extend(local.sources);
108 }
109
110 let merged = merge_with_override(merged);
111 let mut valid = Vec::new();
112 for source in merged {
113 match validate_source(&source) {
114 Ok(()) => valid.push(source),
115 Err(e) => warn!("skipping unsafe source {}: {e}", source.id),
116 }
117 }
118 Ok(valid)
119}
120
121fn merge_with_override(input: Vec<SourceDef>) -> Vec<SourceDef> {
122 let mut out = Vec::new();
123 let mut seen = HashSet::new();
124 for source in input.into_iter().rev() {
125 if seen.insert(source.id.clone()) {
126 out.push(source);
127 }
128 }
129 out.reverse();
130 out
131}
132
133pub fn load_local_sources(config: &AppConfig) -> Result<Option<SourceManifest>> {
134 let p = config
135 .sources_path
136 .clone()
137 .unwrap_or(default_sources_path()?);
138 if !p.exists() {
139 return Ok(None);
140 }
141 let text = fs::read_to_string(&p)
142 .with_context(|| format!("failed reading sources file {}", p.display()))?;
143 let manifest = toml::from_str::<SourceManifest>(&text).context("invalid local sources.toml")?;
144 validate_manifest(&manifest)?;
145 Ok(Some(manifest))
146}
147
148pub fn add_local_source(config: &AppConfig, source: SourceDef) -> Result<PathBuf> {
149 validate_source(&source)?;
150 let path = config
151 .sources_path
152 .clone()
153 .unwrap_or(default_sources_path()?);
154 if let Some(parent) = path.parent() {
155 fs::create_dir_all(parent)?;
156 }
157
158 let mut manifest = if path.exists() {
159 let text = fs::read_to_string(&path)
160 .with_context(|| format!("failed reading sources file {}", path.display()))?;
161 toml::from_str::<SourceManifest>(&text).context("invalid local sources.toml")?
162 } else {
163 SourceManifest {
164 version: Some(1),
165 sources: Vec::new(),
166 }
167 };
168
169 if let Some(existing) = manifest.sources.iter_mut().find(|s| s.id == source.id) {
170 *existing = source;
171 } else {
172 manifest.sources.push(source);
173 }
174
175 manifest.sources.sort_by(|a, b| a.id.cmp(&b.id));
176 let text = toml::to_string_pretty(&manifest)?;
177 fs::write(&path, text)?;
178 Ok(path)
179}
180
181pub async fn load_remote_registry(config: &AppConfig) -> Result<SourceManifest> {
182 let url = config
183 .remote_registry
184 .url
185 .clone()
186 .context("remote registry url missing")?;
187
188 let cache_path = data_dir()?.join("registry-cache.json");
189 let cached = read_cache(&cache_path).ok();
190 let ttl_hours = config.remote_registry.cache_ttl_hours.max(1);
191
192 if let Some(cached) = &cached {
193 let age = Utc::now() - cached.fetched_at;
194 if age.num_hours() < ttl_hours as i64 {
195 return Ok(cached.manifest.clone());
196 }
197 }
198
199 let client = reqwest::Client::new();
200 let mut req = client.get(&url);
201 if let Some(cached) = &cached {
202 if let Some(etag) = &cached.etag {
203 req = req.header(reqwest::header::IF_NONE_MATCH, etag);
204 }
205 }
206
207 let resp = req.send().await?;
208 if resp.status() == reqwest::StatusCode::NOT_MODIFIED {
209 if let Some(cached) = cached {
210 return Ok(cached.manifest);
211 }
212 }
213
214 let status = resp.status();
215 let etag = resp
216 .headers()
217 .get(reqwest::header::ETAG)
218 .and_then(|v| v.to_str().ok())
219 .map(str::to_string);
220
221 if !status.is_success() {
222 if let Some(cached) = cached {
223 return Ok(cached.manifest);
224 }
225 anyhow::bail!("remote registry fetch failed: {status}");
226 }
227
228 let body = resp.text().await?;
229 let manifest = toml::from_str::<SourceManifest>(&body).context("invalid remote manifest")?;
230 validate_manifest(&manifest)?;
231 let snapshot = CachedRegistry {
232 fetched_at: Utc::now(),
233 etag,
234 manifest: manifest.clone(),
235 };
236 fs::write(cache_path, serde_json::to_vec_pretty(&snapshot)?)?;
237 Ok(manifest)
238}
239
240fn read_cache(path: &Path) -> Result<CachedRegistry> {
241 let bytes = fs::read(path)?;
242 Ok(serde_json::from_slice(&bytes)?)
243}
244
245pub fn discover_files(source: &SourceDef) -> Result<Vec<PathBuf>> {
246 validate_source(source)?;
247 let mut files = Vec::new();
248 let max_files = 5000usize;
249 let max_file_bytes = 20 * 1024 * 1024u64;
250
251 for root in &source.roots {
252 let expanded = expand_tilde(root);
253 if !is_root_allowlisted(&expanded) {
254 continue;
255 }
256 if !expanded.exists() {
257 continue;
258 }
259
260 for entry in ignore::WalkBuilder::new(&expanded)
261 .hidden(false)
262 .git_ignore(false)
263 .build()
264 {
265 let entry = match entry {
266 Ok(e) => e,
267 Err(_) => continue,
268 };
269
270 if !entry.file_type().map(|f| f.is_file()).unwrap_or(false) {
271 continue;
272 }
273
274 let relative = entry.path().strip_prefix(&expanded).unwrap_or(entry.path());
275 if matches_any_glob(relative, &source.globs) {
276 if let Ok(md) = entry.metadata() {
277 if md.len() > max_file_bytes {
278 continue;
279 }
280 }
281 files.push(entry.path().to_path_buf());
282 if files.len() >= max_files {
283 break;
284 }
285 }
286 }
287 if files.len() >= max_files {
288 break;
289 }
290 }
291
292 files.sort();
293 files.dedup();
294 Ok(files)
295}
296
297fn matches_any_glob(path: &Path, globs: &[String]) -> bool {
298 let path_text = path.to_string_lossy();
299 globs
300 .iter()
301 .filter_map(|g| Pattern::new(g).ok())
302 .any(|p| p.matches(&path_text))
303}
304
305pub fn validate_manifest(manifest: &SourceManifest) -> Result<()> {
306 if manifest.sources.is_empty() {
307 anyhow::bail!("manifest must contain at least one source");
308 }
309 if let Some(version) = manifest.version {
310 if version != 1 {
311 anyhow::bail!("unsupported manifest version: {version} (expected 1)");
312 }
313 }
314
315 let mut counts: HashMap<&str, usize> = HashMap::new();
316 for source in &manifest.sources {
317 *counts.entry(source.id.as_str()).or_insert(0) += 1;
318 validate_source(source)?;
319 }
320
321 let duplicates = counts
322 .into_iter()
323 .filter_map(|(id, n)| if n > 1 { Some(id.to_string()) } else { None })
324 .collect::<Vec<_>>();
325 if !duplicates.is_empty() {
326 anyhow::bail!(
327 "duplicate source ids in manifest: {}",
328 duplicates.join(", ")
329 );
330 }
331
332 Ok(())
333}
334
335pub fn validate_source(source: &SourceDef) -> Result<()> {
336 if source.id.trim().is_empty() {
337 anyhow::bail!("source id cannot be empty");
338 }
339 let valid_id = source
340 .id
341 .chars()
342 .all(|c| c.is_ascii_alphanumeric() || c == '_' || c == '-' || c == '.');
343 if !valid_id {
344 anyhow::bail!("source id contains invalid characters");
345 }
346
347 if source.roots.is_empty() {
348 anyhow::bail!("source must declare at least one root");
349 }
350 if source.globs.is_empty() {
351 anyhow::bail!("source must declare at least one glob");
352 }
353 if source.format.trim().is_empty() {
354 anyhow::bail!("source format cannot be empty");
355 }
356 if !matches!(source.format.as_str(), "jsonl" | "json" | "mixed") {
357 anyhow::bail!("source format must be one of: jsonl, json, mixed");
358 }
359
360 for root in &source.roots {
361 let expanded = expand_tilde(root);
362 if has_parent_traversal(&expanded) {
363 anyhow::bail!("root has path traversal segments");
364 }
365 if !is_root_allowlisted(&expanded) {
366 anyhow::bail!(
367 "root is outside allowlisted user locations: {}",
368 expanded.display()
369 );
370 }
371 }
372
373 for g in &source.globs {
374 Pattern::new(g).with_context(|| format!("invalid glob pattern: {g}"))?;
375 if g.contains("..") {
376 anyhow::bail!("glob cannot include parent traversal");
377 }
378 }
379
380 Ok(())
381}
382
/// Reports whether `path` contains a `..` component.
///
/// Only whole components count; a name that merely contains two dots
/// (such as `a..b`) does not.
fn has_parent_traversal(path: &Path) -> bool {
    for part in path.components() {
        if part == Component::ParentDir {
            return true;
        }
    }
    false
}
386
387fn is_root_allowlisted(path: &Path) -> bool {
388 allowlisted_roots()
389 .into_iter()
390 .any(|root| path_starts_with(path, &root))
391}
392
393fn expand_tilde(input: &str) -> PathBuf {
394 if input == "~" || input.starts_with("~/") || input.starts_with("~\\") {
395 if let Some(home) = resolve_home_dir() {
396 return PathBuf::from(input.replacen('~', home.to_string_lossy().as_ref(), 1));
397 }
398 }
399 PathBuf::from(input)
400}
401
402fn allowlisted_roots() -> Vec<PathBuf> {
403 let mut roots = Vec::new();
404 if let Some(home) = resolve_home_dir() {
405 roots.push(home);
406 }
407 if let Ok(appdata) = env::var("APPDATA") {
408 roots.push(PathBuf::from(appdata));
409 }
410 if let Ok(local_appdata) = env::var("LOCALAPPDATA") {
411 roots.push(PathBuf::from(local_appdata));
412 }
413 roots
414}
415
/// Returns the user's home directory from the environment.
///
/// `HOME` is preferred and `USERPROFILE` is the fallback. A variable that is
/// unset or empty is skipped, so an empty `HOME` falls through to
/// `USERPROFILE` instead of producing an empty path. Returns `None` when
/// neither variable holds a value.
fn resolve_home_dir() -> Option<PathBuf> {
    // `var_os` keeps home directories whose names are not valid UTF-8.
    ["HOME", "USERPROFILE"]
        .iter()
        .filter_map(|key| env::var_os(key))
        .find(|value| !value.is_empty())
        .map(PathBuf::from)
}
422
/// Reports whether `path` is `root` itself or lies below it.
///
/// On Windows the comparison is done on lower-cased text and accepts either
/// `\` or `/` as the separator following `root`. On other platforms it is the
/// component-wise `Path::starts_with`.
fn path_starts_with(path: &Path, root: &Path) -> bool {
    if cfg!(windows) {
        let candidate = path.to_string_lossy().to_lowercase();
        let base = root.to_string_lossy().to_lowercase();
        if candidate == base {
            return true;
        }
        // Require a separator right after the root so that `C:\Users\ab`
        // is not treated as being inside `C:\Users\a`.
        match candidate.strip_prefix(base.as_str()) {
            Some(rest) => rest.starts_with('\\') || rest.starts_with('/'),
            None => false,
        }
    } else {
        path.starts_with(root)
    }
}
438
#[cfg(test)]
mod tests {
    use std::time::{SystemTime, UNIX_EPOCH};

    use crate::config::AppConfig;

    use super::{
        SourceDef, SourceManifest, add_local_source, load_local_sources, validate_manifest,
        validate_source,
    };

    /// Builds a jsonl source with a single root, the `**/*.jsonl` glob and
    /// every optional field unset; tests override what they need.
    fn jsonl_source(id: &str, root: &str) -> SourceDef {
        SourceDef {
            id: id.to_owned(),
            display_name: None,
            roots: vec![root.to_owned()],
            globs: vec!["**/*.jsonl".to_owned()],
            format: "jsonl".to_owned(),
            parser_hint: None,
            platforms: None,
            requires_opt_in: None,
        }
    }

    /// A source added through `add_local_source` can be read back with
    /// `load_local_sources`.
    #[test]
    fn add_local_source_persists_manifest() {
        // A nanosecond timestamp keeps the temp file name unique per run.
        let nonce = SystemTime::now()
            .duration_since(UNIX_EPOCH)
            .expect("clock drift")
            .as_nanos();
        let test_path = std::env::temp_dir().join(format!("trace-share-sources-{nonce}.toml"));

        let mut cfg = AppConfig::default();
        cfg.sources_path = Some(test_path.clone());

        let src = SourceDef {
            display_name: Some("Demo".to_owned()),
            parser_hint: Some("generic".to_owned()),
            requires_opt_in: Some(true),
            ..jsonl_source("demo_source", "~/demo")
        };

        add_local_source(&cfg, src).expect("add source");
        let loaded = load_local_sources(&cfg)
            .expect("load manifest")
            .expect("manifest exists");
        let has_demo = loaded.sources.iter().any(|s| s.id == "demo_source");
        assert!(has_demo);

        let _ = std::fs::remove_file(test_path);
    }

    /// Ids containing a space are rejected.
    #[test]
    fn rejects_invalid_source_id() {
        let src = jsonl_source("bad id", "~/demo");
        assert!(validate_source(&src).is_err());
    }

    /// A manifest listing the same id twice is rejected.
    #[test]
    fn rejects_duplicate_source_ids_in_manifest() {
        let source = SourceDef {
            requires_opt_in: Some(false),
            ..jsonl_source("dup_source", "~/.codex/sessions")
        };
        let manifest = SourceManifest {
            version: Some(1),
            sources: vec![source.clone(), source],
        };
        assert!(validate_manifest(&manifest).is_err());
    }
}