1use anyhow::{Context, Result};
2use chrono::{DateTime, Utc};
3use glob::Pattern;
4use serde::{Deserialize, Serialize};
5use std::{
6 collections::HashMap,
7 collections::HashSet,
8 env, fs,
9 path::{Component, Path, PathBuf},
10};
11use tracing::warn;
12
13use crate::config::{AppConfig, data_dir, default_sources_path};
14use crate::security::{ensure_secure_url, write_private_file};
15
/// Describes one trace source: where its files live and how they are encoded.
///
/// Definitions come from the built-in list, the local sources file, or the
/// remote registry manifest, and are checked by `validate_source`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SourceDef {
    /// Unique identifier; `validate_source` accepts only ASCII alphanumerics,
    /// `_`, `-` and `.`.
    pub id: String,
    /// Optional human-readable label (not read anywhere in this file).
    pub display_name: Option<String>,
    /// Directories to scan; a leading `~` is expanded to the home directory.
    pub roots: Vec<String>,
    /// Glob patterns matched against file paths relative to each root.
    pub globs: Vec<String>,
    /// File format; `validate_source` accepts `jsonl`, `json` or `mixed`.
    pub format: String,
    /// Parser identifier; presumably selects a parser elsewhere — it is not
    /// interpreted in this file.
    pub parser_hint: Option<String>,
    /// Platform names; presumably restricts where the source applies — it is
    /// not enforced in this file.
    pub platforms: Option<Vec<String>>,
    /// Opt-in flag; presumably gates collection elsewhere — it is not read in
    /// this file.
    pub requires_opt_in: Option<bool>,
}
27
/// A versioned collection of source definitions, as stored in the local
/// sources file or served by the remote registry.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct SourceManifest {
    /// Manifest schema version; when present, `validate_manifest` requires it
    /// to be `1`.
    pub version: Option<u32>,
    /// Source definitions; `validate_manifest` rejects an empty list and
    /// duplicate ids.
    pub sources: Vec<SourceDef>,
}
33
/// On-disk snapshot of the remote registry, written as JSON by
/// `load_remote_registry` and read back by `read_cache`.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct CachedRegistry {
    /// When the manifest was downloaded; compared against the configured TTL.
    fetched_at: DateTime<Utc>,
    /// `ETag` response header from the download, replayed as `If-None-Match`.
    etag: Option<String>,
    /// The manifest that was downloaded and validated.
    manifest: SourceManifest,
}
40
41pub fn builtin_sources() -> Vec<SourceDef> {
42 vec![
43 SourceDef {
44 id: "codex_cli".to_string(),
45 display_name: Some("Codex CLI".to_string()),
46 roots: vec!["~/.codex/sessions".to_string()],
47 globs: vec!["**/*".to_string()],
48 format: "jsonl".to_string(),
49 parser_hint: Some("codex_cli_v1".to_string()),
50 platforms: None,
51 requires_opt_in: Some(false),
52 },
53 SourceDef {
54 id: "claude_code".to_string(),
55 display_name: Some("Claude Code".to_string()),
56 roots: vec!["~/.claude/projects".to_string()],
57 globs: vec!["**/sessions/*.jsonl".to_string()],
58 format: "jsonl".to_string(),
59 parser_hint: Some("claude_code_v1".to_string()),
60 platforms: None,
61 requires_opt_in: Some(false),
62 },
63 SourceDef {
64 id: "vscode_global_storage".to_string(),
65 display_name: Some("VS Code Global Storage".to_string()),
66 roots: vec![
67 "~/.config/Code/User/globalStorage".to_string(),
68 "~/Library/Application Support/Code/User/globalStorage".to_string(),
69 "~/AppData/Roaming/Code/User/globalStorage".to_string(),
70 ],
71 globs: vec!["**/*.jsonl".to_string(), "**/*.json".to_string()],
72 format: "jsonl".to_string(),
73 parser_hint: Some("vscode_storage_v1".to_string()),
74 platforms: None,
75 requires_opt_in: Some(false),
76 },
77 SourceDef {
78 id: "tandem_sessions".to_string(),
79 display_name: Some("Tandem Sessions".to_string()),
80 roots: vec![
81 "~/.local/share/tandem/data/storage".to_string(),
82 "~/Library/Application Support/tandem/data/storage".to_string(),
83 "~/AppData/Roaming/tandem/data/storage".to_string(),
84 ],
85 globs: vec!["**/sessions.json".to_string()],
86 format: "json".to_string(),
87 parser_hint: Some("tandem_v1".to_string()),
88 platforms: Some(vec![
89 "linux".to_string(),
90 "macos".to_string(),
91 "windows".to_string(),
92 ]),
93 requires_opt_in: Some(true),
94 },
95 ]
96}
97
98pub async fn resolve_sources(config: &AppConfig) -> Result<Vec<SourceDef>> {
99 let mut merged = builtin_sources();
100
101 if config.remote_registry.enabled {
102 if let Ok(remote) = load_remote_registry(config).await {
103 merged.extend(remote.sources);
104 }
105 }
106
107 if let Some(local) = load_local_sources(config)? {
108 merged.extend(local.sources);
109 }
110
111 let merged = merge_with_override(merged);
112 let mut valid = Vec::new();
113 for source in merged {
114 match validate_source(&source) {
115 Ok(()) => valid.push(source),
116 Err(e) => warn!("skipping unsafe source {}: {e}", source.id),
117 }
118 }
119 Ok(valid)
120}
121
122fn merge_with_override(input: Vec<SourceDef>) -> Vec<SourceDef> {
123 let mut out = Vec::new();
124 let mut seen = HashSet::new();
125 for source in input.into_iter().rev() {
126 if seen.insert(source.id.clone()) {
127 out.push(source);
128 }
129 }
130 out.reverse();
131 out
132}
133
134pub fn load_local_sources(config: &AppConfig) -> Result<Option<SourceManifest>> {
135 let p = config
136 .sources_path
137 .clone()
138 .unwrap_or(default_sources_path()?);
139 if !p.exists() {
140 return Ok(None);
141 }
142 let text = fs::read_to_string(&p)
143 .with_context(|| format!("failed reading sources file {}", p.display()))?;
144 let manifest = toml::from_str::<SourceManifest>(&text).context("invalid local sources.toml")?;
145 validate_manifest(&manifest)?;
146 Ok(Some(manifest))
147}
148
149pub fn add_local_source(config: &AppConfig, source: SourceDef) -> Result<PathBuf> {
150 validate_source(&source)?;
151 let path = config
152 .sources_path
153 .clone()
154 .unwrap_or(default_sources_path()?);
155 if let Some(parent) = path.parent() {
156 fs::create_dir_all(parent)?;
157 }
158
159 let mut manifest = if path.exists() {
160 let text = fs::read_to_string(&path)
161 .with_context(|| format!("failed reading sources file {}", path.display()))?;
162 toml::from_str::<SourceManifest>(&text).context("invalid local sources.toml")?
163 } else {
164 SourceManifest {
165 version: Some(1),
166 sources: Vec::new(),
167 }
168 };
169
170 if let Some(existing) = manifest.sources.iter_mut().find(|s| s.id == source.id) {
171 *existing = source;
172 } else {
173 manifest.sources.push(source);
174 }
175
176 manifest.sources.sort_by(|a, b| a.id.cmp(&b.id));
177 let text = toml::to_string_pretty(&manifest)?;
178 write_private_file(&path, text.as_bytes())?;
179 Ok(path)
180}
181
182pub async fn load_remote_registry(config: &AppConfig) -> Result<SourceManifest> {
183 let url = config
184 .remote_registry
185 .url
186 .clone()
187 .context("remote registry url missing")?;
188 ensure_secure_url(&url, "remote registry URL")?;
189
190 let cache_path = data_dir()?.join("registry-cache.json");
191 let cached = read_cache(&cache_path).ok();
192 let ttl_hours = config.remote_registry.cache_ttl_hours.max(1);
193
194 if let Some(cached) = &cached {
195 let age = Utc::now() - cached.fetched_at;
196 if age.num_hours() < ttl_hours as i64 {
197 return Ok(cached.manifest.clone());
198 }
199 }
200
201 let client = reqwest::Client::new();
202 let mut req = client.get(&url);
203 if let Some(cached) = &cached {
204 if let Some(etag) = &cached.etag {
205 req = req.header(reqwest::header::IF_NONE_MATCH, etag);
206 }
207 }
208
209 let resp = req.send().await?;
210 if resp.status() == reqwest::StatusCode::NOT_MODIFIED {
211 if let Some(cached) = cached {
212 return Ok(cached.manifest);
213 }
214 }
215
216 let status = resp.status();
217 let etag = resp
218 .headers()
219 .get(reqwest::header::ETAG)
220 .and_then(|v| v.to_str().ok())
221 .map(str::to_string);
222
223 if !status.is_success() {
224 if let Some(cached) = cached {
225 return Ok(cached.manifest);
226 }
227 anyhow::bail!("remote registry fetch failed: {status}");
228 }
229
230 let body = resp.text().await?;
231 let manifest = toml::from_str::<SourceManifest>(&body).context("invalid remote manifest")?;
232 validate_manifest(&manifest)?;
233 let snapshot = CachedRegistry {
234 fetched_at: Utc::now(),
235 etag,
236 manifest: manifest.clone(),
237 };
238 let bytes = serde_json::to_vec_pretty(&snapshot)?;
239 write_private_file(&cache_path, &bytes)?;
240 Ok(manifest)
241}
242
243fn read_cache(path: &Path) -> Result<CachedRegistry> {
244 let bytes = fs::read(path)?;
245 Ok(serde_json::from_slice(&bytes)?)
246}
247
248pub fn discover_files(source: &SourceDef) -> Result<Vec<PathBuf>> {
249 validate_source(source)?;
250 let mut files = Vec::new();
251 let max_files = 5000usize;
252 let max_file_bytes = 20 * 1024 * 1024u64;
253
254 for root in &source.roots {
255 let expanded = expand_tilde(root);
256 if !is_root_allowlisted(&expanded) {
257 continue;
258 }
259 if !expanded.exists() {
260 continue;
261 }
262
263 for entry in ignore::WalkBuilder::new(&expanded)
264 .hidden(false)
265 .git_ignore(false)
266 .build()
267 {
268 let entry = match entry {
269 Ok(e) => e,
270 Err(_) => continue,
271 };
272
273 if !entry.file_type().map(|f| f.is_file()).unwrap_or(false) {
274 continue;
275 }
276
277 let relative = entry.path().strip_prefix(&expanded).unwrap_or(entry.path());
278 if matches_any_glob(relative, &source.globs) {
279 if let Ok(md) = entry.metadata() {
280 if md.len() > max_file_bytes {
281 continue;
282 }
283 }
284 files.push(entry.path().to_path_buf());
285 if files.len() >= max_files {
286 break;
287 }
288 }
289 }
290 if files.len() >= max_files {
291 break;
292 }
293 }
294
295 files.sort();
296 files.dedup();
297 Ok(files)
298}
299
300fn matches_any_glob(path: &Path, globs: &[String]) -> bool {
301 let path_text = path.to_string_lossy();
302 globs
303 .iter()
304 .filter_map(|g| Pattern::new(g).ok())
305 .any(|p| p.matches(&path_text))
306}
307
308pub fn validate_manifest(manifest: &SourceManifest) -> Result<()> {
309 if manifest.sources.is_empty() {
310 anyhow::bail!("manifest must contain at least one source");
311 }
312 if let Some(version) = manifest.version {
313 if version != 1 {
314 anyhow::bail!("unsupported manifest version: {version} (expected 1)");
315 }
316 }
317
318 let mut counts: HashMap<&str, usize> = HashMap::new();
319 for source in &manifest.sources {
320 *counts.entry(source.id.as_str()).or_insert(0) += 1;
321 validate_source(source)?;
322 }
323
324 let duplicates = counts
325 .into_iter()
326 .filter_map(|(id, n)| if n > 1 { Some(id.to_string()) } else { None })
327 .collect::<Vec<_>>();
328 if !duplicates.is_empty() {
329 anyhow::bail!(
330 "duplicate source ids in manifest: {}",
331 duplicates.join(", ")
332 );
333 }
334
335 Ok(())
336}
337
338pub fn validate_source(source: &SourceDef) -> Result<()> {
339 if source.id.trim().is_empty() {
340 anyhow::bail!("source id cannot be empty");
341 }
342 let valid_id = source
343 .id
344 .chars()
345 .all(|c| c.is_ascii_alphanumeric() || c == '_' || c == '-' || c == '.');
346 if !valid_id {
347 anyhow::bail!("source id contains invalid characters");
348 }
349
350 if source.roots.is_empty() {
351 anyhow::bail!("source must declare at least one root");
352 }
353 if source.globs.is_empty() {
354 anyhow::bail!("source must declare at least one glob");
355 }
356 if source.format.trim().is_empty() {
357 anyhow::bail!("source format cannot be empty");
358 }
359 if !matches!(source.format.as_str(), "jsonl" | "json" | "mixed") {
360 anyhow::bail!("source format must be one of: jsonl, json, mixed");
361 }
362
363 for root in &source.roots {
364 let expanded = expand_tilde(root);
365 if has_parent_traversal(&expanded) {
366 anyhow::bail!("root has path traversal segments");
367 }
368 if !is_root_allowlisted(&expanded) {
369 anyhow::bail!(
370 "root is outside allowlisted user locations: {}",
371 expanded.display()
372 );
373 }
374 }
375
376 for g in &source.globs {
377 Pattern::new(g).with_context(|| format!("invalid glob pattern: {g}"))?;
378 if g.contains("..") {
379 anyhow::bail!("glob cannot include parent traversal");
380 }
381 }
382
383 Ok(())
384}
385
/// Reports whether `path` contains a `..` component.
fn has_parent_traversal(path: &Path) -> bool {
    for segment in path.components() {
        if let Component::ParentDir = segment {
            return true;
        }
    }
    false
}
389
390fn is_root_allowlisted(path: &Path) -> bool {
391 allowlisted_roots()
392 .into_iter()
393 .any(|root| path_starts_with(path, &root))
394}
395
396fn expand_tilde(input: &str) -> PathBuf {
397 if input == "~" || input.starts_with("~/") || input.starts_with("~\\") {
398 if let Some(home) = resolve_home_dir() {
399 return PathBuf::from(input.replacen('~', home.to_string_lossy().as_ref(), 1));
400 }
401 }
402 PathBuf::from(input)
403}
404
405fn allowlisted_roots() -> Vec<PathBuf> {
406 let mut roots = Vec::new();
407 if let Some(home) = resolve_home_dir() {
408 roots.push(home);
409 }
410 if let Ok(appdata) = env::var("APPDATA") {
411 roots.push(PathBuf::from(appdata));
412 }
413 if let Ok(local_appdata) = env::var("LOCALAPPDATA") {
414 roots.push(PathBuf::from(local_appdata));
415 }
416 roots
417}
418
/// Returns the user's home directory from `HOME`, or from `USERPROFILE` when
/// `HOME` is unavailable.
///
/// A variable that is unset, not valid Unicode, or empty is treated as
/// unavailable. Returns `None` when neither variable yields a value.
fn resolve_home_dir() -> Option<PathBuf> {
    // An empty value must not be returned: as an allowlist root it would be a
    // prefix of every path, and it would expand `~/x` to the absolute `/x`.
    ["HOME", "USERPROFILE"]
        .iter()
        .filter_map(|key| env::var(key).ok())
        .find(|value| !value.is_empty())
        .map(PathBuf::from)
}
425
/// Reports whether `path` is `root` itself or lies beneath it.
///
/// On Windows the comparison is done on lowercased strings and accepts either
/// `\` or `/` after the root. Elsewhere it is a component-wise prefix check.
fn path_starts_with(path: &Path, root: &Path) -> bool {
    if cfg!(windows) {
        let candidate = path.to_string_lossy().to_lowercase();
        let base = root.to_string_lossy().to_lowercase();
        if candidate == base {
            return true;
        }
        // Require a separator after the root so `C:\Users\me2` is not taken
        // to be inside `C:\Users\me`.
        ['\\', '/'].iter().any(|separator| {
            let mut prefix = base.clone();
            prefix.push(*separator);
            candidate.starts_with(&prefix)
        })
    } else {
        path.starts_with(root)
    }
}
441
#[cfg(test)]
mod tests {
    use std::time::{SystemTime, UNIX_EPOCH};

    use crate::config::AppConfig;

    use super::{
        SourceDef, SourceManifest, add_local_source, load_local_sources, validate_manifest,
        validate_source,
    };

    /// Builds a minimal JSONL source with one root; tests override the
    /// optional fields they care about with struct-update syntax.
    fn source_with(id: &str, root: &str) -> SourceDef {
        SourceDef {
            id: id.to_string(),
            display_name: None,
            roots: vec![root.to_string()],
            globs: vec!["**/*.jsonl".to_string()],
            format: "jsonl".to_string(),
            parser_hint: None,
            platforms: None,
            requires_opt_in: None,
        }
    }

    /// A source added through `add_local_source` can be read back from disk.
    #[test]
    fn add_local_source_persists_manifest() {
        // A timestamp-based name keeps concurrent runs from sharing a file.
        let nonce = SystemTime::now()
            .duration_since(UNIX_EPOCH)
            .expect("clock drift")
            .as_nanos();
        let test_path = std::env::temp_dir().join(format!("trace-share-sources-{nonce}.toml"));

        let mut cfg = AppConfig::default();
        cfg.sources_path = Some(test_path.clone());

        let src = SourceDef {
            display_name: Some("Demo".to_string()),
            parser_hint: Some("generic".to_string()),
            requires_opt_in: Some(true),
            ..source_with("demo_source", "~/demo")
        };

        add_local_source(&cfg, src).expect("add source");
        let loaded = load_local_sources(&cfg)
            .expect("load manifest")
            .expect("manifest exists");
        let persisted = loaded.sources.iter().any(|s| s.id == "demo_source");
        assert!(persisted);

        let _ = std::fs::remove_file(test_path);
    }

    /// An id containing a space is rejected.
    #[test]
    fn rejects_invalid_source_id() {
        let src = source_with("bad id", "~/demo");
        assert!(validate_source(&src).is_err());
    }

    /// A manifest that lists the same id twice is rejected.
    #[test]
    fn rejects_duplicate_source_ids_in_manifest() {
        let source = SourceDef {
            requires_opt_in: Some(false),
            ..source_with("dup_source", "~/.codex/sessions")
        };
        let manifest = SourceManifest {
            version: Some(1),
            sources: vec![source.clone(), source],
        };
        assert!(validate_manifest(&manifest).is_err());
    }
}