Skip to main content

trace_share_core/
sources.rs

1use anyhow::{Context, Result};
2use chrono::{DateTime, Utc};
3use glob::Pattern;
4use serde::{Deserialize, Serialize};
5use std::{
6    collections::HashMap,
7    collections::HashSet,
8    env, fs,
9    path::{Component, Path, PathBuf},
10};
11use tracing::warn;
12
13use crate::config::{AppConfig, data_dir, default_sources_path};
14
/// Describes one location on disk where trace files of a given tool can be found.
///
/// Instances come from the built-in table, the local sources file, or the
/// remote registry, and are checked by `validate_source` before use.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SourceDef {
    /// Unique identifier; restricted to ASCII alphanumerics, `_`, `-` and `.`.
    /// A later definition with the same id overrides an earlier one when merged.
    pub id: String,
    /// Optional human-readable label.
    pub display_name: Option<String>,
    /// Directories to scan. A leading `~` is expanded to the home directory;
    /// each root must resolve inside an allowlisted user location.
    pub roots: Vec<String>,
    /// Glob patterns matched against each file's path relative to its root.
    pub globs: Vec<String>,
    /// File format; must be one of `jsonl`, `json` or `mixed`.
    pub format: String,
    /// Optional parser selector. Not interpreted in this module —
    /// presumably consumed by the parsing layer; verify against callers.
    pub parser_hint: Option<String>,
    /// Optional platform names (the built-ins use `linux`, `macos`, `windows`).
    /// Not interpreted in this module.
    pub platforms: Option<Vec<String>>,
    /// Optional flag; by its name it marks sources that need explicit user
    /// opt-in. Not interpreted in this module — confirm against callers.
    pub requires_opt_in: Option<bool>,
}
26
/// A versioned list of source definitions, as stored in the local sources
/// TOML file or served by the remote registry.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct SourceManifest {
    /// Manifest schema version; when present it must be `1`.
    pub version: Option<u32>,
    /// The source definitions; validation requires at least one entry and
    /// unique ids.
    pub sources: Vec<SourceDef>,
}
32
/// On-disk snapshot of the remote registry, stored as JSON in the data directory.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct CachedRegistry {
    /// When the manifest was fetched; compared against the configured TTL.
    fetched_at: DateTime<Utc>,
    /// `ETag` response header of the fetch, replayed as `If-None-Match`.
    etag: Option<String>,
    /// The manifest that was downloaded.
    manifest: SourceManifest,
}
39
40pub fn builtin_sources() -> Vec<SourceDef> {
41    vec![
42        SourceDef {
43            id: "codex_cli".to_string(),
44            display_name: Some("Codex CLI".to_string()),
45            roots: vec!["~/.codex/sessions".to_string()],
46            globs: vec!["**/*".to_string()],
47            format: "jsonl".to_string(),
48            parser_hint: Some("codex_cli_v1".to_string()),
49            platforms: None,
50            requires_opt_in: Some(false),
51        },
52        SourceDef {
53            id: "claude_code".to_string(),
54            display_name: Some("Claude Code".to_string()),
55            roots: vec!["~/.claude/projects".to_string()],
56            globs: vec!["**/sessions/*.jsonl".to_string()],
57            format: "jsonl".to_string(),
58            parser_hint: Some("claude_code_v1".to_string()),
59            platforms: None,
60            requires_opt_in: Some(false),
61        },
62        SourceDef {
63            id: "vscode_global_storage".to_string(),
64            display_name: Some("VS Code Global Storage".to_string()),
65            roots: vec![
66                "~/.config/Code/User/globalStorage".to_string(),
67                "~/Library/Application Support/Code/User/globalStorage".to_string(),
68                "~/AppData/Roaming/Code/User/globalStorage".to_string(),
69            ],
70            globs: vec!["**/*.jsonl".to_string(), "**/*.json".to_string()],
71            format: "jsonl".to_string(),
72            parser_hint: Some("vscode_storage_v1".to_string()),
73            platforms: None,
74            requires_opt_in: Some(false),
75        },
76        SourceDef {
77            id: "tandem_sessions".to_string(),
78            display_name: Some("Tandem Sessions".to_string()),
79            roots: vec![
80                "~/.local/share/tandem/data/storage".to_string(),
81                "~/Library/Application Support/tandem/data/storage".to_string(),
82                "~/AppData/Roaming/tandem/data/storage".to_string(),
83            ],
84            globs: vec!["**/sessions.json".to_string()],
85            format: "json".to_string(),
86            parser_hint: Some("tandem_v1".to_string()),
87            platforms: Some(vec![
88                "linux".to_string(),
89                "macos".to_string(),
90                "windows".to_string(),
91            ]),
92            requires_opt_in: Some(true),
93        },
94    ]
95}
96
97pub async fn resolve_sources(config: &AppConfig) -> Result<Vec<SourceDef>> {
98    let mut merged = builtin_sources();
99
100    if config.remote_registry.enabled {
101        if let Ok(remote) = load_remote_registry(config).await {
102            merged.extend(remote.sources);
103        }
104    }
105
106    if let Some(local) = load_local_sources(config)? {
107        merged.extend(local.sources);
108    }
109
110    let merged = merge_with_override(merged);
111    let mut valid = Vec::new();
112    for source in merged {
113        match validate_source(&source) {
114            Ok(()) => valid.push(source),
115            Err(e) => warn!("skipping unsafe source {}: {e}", source.id),
116        }
117    }
118    Ok(valid)
119}
120
121fn merge_with_override(input: Vec<SourceDef>) -> Vec<SourceDef> {
122    let mut out = Vec::new();
123    let mut seen = HashSet::new();
124    for source in input.into_iter().rev() {
125        if seen.insert(source.id.clone()) {
126            out.push(source);
127        }
128    }
129    out.reverse();
130    out
131}
132
133pub fn load_local_sources(config: &AppConfig) -> Result<Option<SourceManifest>> {
134    let p = config
135        .sources_path
136        .clone()
137        .unwrap_or(default_sources_path()?);
138    if !p.exists() {
139        return Ok(None);
140    }
141    let text = fs::read_to_string(&p)
142        .with_context(|| format!("failed reading sources file {}", p.display()))?;
143    let manifest = toml::from_str::<SourceManifest>(&text).context("invalid local sources.toml")?;
144    validate_manifest(&manifest)?;
145    Ok(Some(manifest))
146}
147
148pub fn add_local_source(config: &AppConfig, source: SourceDef) -> Result<PathBuf> {
149    validate_source(&source)?;
150    let path = config
151        .sources_path
152        .clone()
153        .unwrap_or(default_sources_path()?);
154    if let Some(parent) = path.parent() {
155        fs::create_dir_all(parent)?;
156    }
157
158    let mut manifest = if path.exists() {
159        let text = fs::read_to_string(&path)
160            .with_context(|| format!("failed reading sources file {}", path.display()))?;
161        toml::from_str::<SourceManifest>(&text).context("invalid local sources.toml")?
162    } else {
163        SourceManifest {
164            version: Some(1),
165            sources: Vec::new(),
166        }
167    };
168
169    if let Some(existing) = manifest.sources.iter_mut().find(|s| s.id == source.id) {
170        *existing = source;
171    } else {
172        manifest.sources.push(source);
173    }
174
175    manifest.sources.sort_by(|a, b| a.id.cmp(&b.id));
176    let text = toml::to_string_pretty(&manifest)?;
177    fs::write(&path, text)?;
178    Ok(path)
179}
180
181pub async fn load_remote_registry(config: &AppConfig) -> Result<SourceManifest> {
182    let url = config
183        .remote_registry
184        .url
185        .clone()
186        .context("remote registry url missing")?;
187
188    let cache_path = data_dir()?.join("registry-cache.json");
189    let cached = read_cache(&cache_path).ok();
190    let ttl_hours = config.remote_registry.cache_ttl_hours.max(1);
191
192    if let Some(cached) = &cached {
193        let age = Utc::now() - cached.fetched_at;
194        if age.num_hours() < ttl_hours as i64 {
195            return Ok(cached.manifest.clone());
196        }
197    }
198
199    let client = reqwest::Client::new();
200    let mut req = client.get(&url);
201    if let Some(cached) = &cached {
202        if let Some(etag) = &cached.etag {
203            req = req.header(reqwest::header::IF_NONE_MATCH, etag);
204        }
205    }
206
207    let resp = req.send().await?;
208    if resp.status() == reqwest::StatusCode::NOT_MODIFIED {
209        if let Some(cached) = cached {
210            return Ok(cached.manifest);
211        }
212    }
213
214    let status = resp.status();
215    let etag = resp
216        .headers()
217        .get(reqwest::header::ETAG)
218        .and_then(|v| v.to_str().ok())
219        .map(str::to_string);
220
221    if !status.is_success() {
222        if let Some(cached) = cached {
223            return Ok(cached.manifest);
224        }
225        anyhow::bail!("remote registry fetch failed: {status}");
226    }
227
228    let body = resp.text().await?;
229    let manifest = toml::from_str::<SourceManifest>(&body).context("invalid remote manifest")?;
230    validate_manifest(&manifest)?;
231    let snapshot = CachedRegistry {
232        fetched_at: Utc::now(),
233        etag,
234        manifest: manifest.clone(),
235    };
236    fs::write(cache_path, serde_json::to_vec_pretty(&snapshot)?)?;
237    Ok(manifest)
238}
239
240fn read_cache(path: &Path) -> Result<CachedRegistry> {
241    let bytes = fs::read(path)?;
242    Ok(serde_json::from_slice(&bytes)?)
243}
244
245pub fn discover_files(source: &SourceDef) -> Result<Vec<PathBuf>> {
246    validate_source(source)?;
247    let mut files = Vec::new();
248    let max_files = 5000usize;
249    let max_file_bytes = 20 * 1024 * 1024u64;
250
251    for root in &source.roots {
252        let expanded = expand_tilde(root);
253        if !is_root_allowlisted(&expanded) {
254            continue;
255        }
256        if !expanded.exists() {
257            continue;
258        }
259
260        for entry in ignore::WalkBuilder::new(&expanded)
261            .hidden(false)
262            .git_ignore(false)
263            .build()
264        {
265            let entry = match entry {
266                Ok(e) => e,
267                Err(_) => continue,
268            };
269
270            if !entry.file_type().map(|f| f.is_file()).unwrap_or(false) {
271                continue;
272            }
273
274            let relative = entry.path().strip_prefix(&expanded).unwrap_or(entry.path());
275            if matches_any_glob(relative, &source.globs) {
276                if let Ok(md) = entry.metadata() {
277                    if md.len() > max_file_bytes {
278                        continue;
279                    }
280                }
281                files.push(entry.path().to_path_buf());
282                if files.len() >= max_files {
283                    break;
284                }
285            }
286        }
287        if files.len() >= max_files {
288            break;
289        }
290    }
291
292    files.sort();
293    files.dedup();
294    Ok(files)
295}
296
297fn matches_any_glob(path: &Path, globs: &[String]) -> bool {
298    let path_text = path.to_string_lossy();
299    globs
300        .iter()
301        .filter_map(|g| Pattern::new(g).ok())
302        .any(|p| p.matches(&path_text))
303}
304
305pub fn validate_manifest(manifest: &SourceManifest) -> Result<()> {
306    if manifest.sources.is_empty() {
307        anyhow::bail!("manifest must contain at least one source");
308    }
309    if let Some(version) = manifest.version {
310        if version != 1 {
311            anyhow::bail!("unsupported manifest version: {version} (expected 1)");
312        }
313    }
314
315    let mut counts: HashMap<&str, usize> = HashMap::new();
316    for source in &manifest.sources {
317        *counts.entry(source.id.as_str()).or_insert(0) += 1;
318        validate_source(source)?;
319    }
320
321    let duplicates = counts
322        .into_iter()
323        .filter_map(|(id, n)| if n > 1 { Some(id.to_string()) } else { None })
324        .collect::<Vec<_>>();
325    if !duplicates.is_empty() {
326        anyhow::bail!(
327            "duplicate source ids in manifest: {}",
328            duplicates.join(", ")
329        );
330    }
331
332    Ok(())
333}
334
335pub fn validate_source(source: &SourceDef) -> Result<()> {
336    if source.id.trim().is_empty() {
337        anyhow::bail!("source id cannot be empty");
338    }
339    let valid_id = source
340        .id
341        .chars()
342        .all(|c| c.is_ascii_alphanumeric() || c == '_' || c == '-' || c == '.');
343    if !valid_id {
344        anyhow::bail!("source id contains invalid characters");
345    }
346
347    if source.roots.is_empty() {
348        anyhow::bail!("source must declare at least one root");
349    }
350    if source.globs.is_empty() {
351        anyhow::bail!("source must declare at least one glob");
352    }
353    if source.format.trim().is_empty() {
354        anyhow::bail!("source format cannot be empty");
355    }
356    if !matches!(source.format.as_str(), "jsonl" | "json" | "mixed") {
357        anyhow::bail!("source format must be one of: jsonl, json, mixed");
358    }
359
360    for root in &source.roots {
361        let expanded = expand_tilde(root);
362        if has_parent_traversal(&expanded) {
363            anyhow::bail!("root has path traversal segments");
364        }
365        if !is_root_allowlisted(&expanded) {
366            anyhow::bail!(
367                "root is outside allowlisted user locations: {}",
368                expanded.display()
369            );
370        }
371    }
372
373    for g in &source.globs {
374        Pattern::new(g).with_context(|| format!("invalid glob pattern: {g}"))?;
375        if g.contains("..") {
376            anyhow::bail!("glob cannot include parent traversal");
377        }
378    }
379
380    Ok(())
381}
382
/// Reports whether `path` contains a `..` component.
///
/// Only whole components count: a name that merely contains two dots
/// (for example `..b`) is not a traversal.
fn has_parent_traversal(path: &Path) -> bool {
    for part in path.components() {
        if part == Component::ParentDir {
            return true;
        }
    }
    false
}
386
387fn is_root_allowlisted(path: &Path) -> bool {
388    allowlisted_roots()
389        .into_iter()
390        .any(|root| path_starts_with(path, &root))
391}
392
393fn expand_tilde(input: &str) -> PathBuf {
394    if input == "~" || input.starts_with("~/") || input.starts_with("~\\") {
395        if let Some(home) = resolve_home_dir() {
396            return PathBuf::from(input.replacen('~', home.to_string_lossy().as_ref(), 1));
397        }
398    }
399    PathBuf::from(input)
400}
401
402fn allowlisted_roots() -> Vec<PathBuf> {
403    let mut roots = Vec::new();
404    if let Some(home) = resolve_home_dir() {
405        roots.push(home);
406    }
407    if let Ok(appdata) = env::var("APPDATA") {
408        roots.push(PathBuf::from(appdata));
409    }
410    if let Ok(local_appdata) = env::var("LOCALAPPDATA") {
411        roots.push(PathBuf::from(local_appdata));
412    }
413    roots
414}
415
/// Determines the user's home directory from the environment.
///
/// `HOME` is preferred and `USERPROFILE` is the fallback. A variable that
/// is unset, not valid Unicode, or empty is skipped. An empty value must
/// not be accepted: it would expand `~/x` to `/x` and act as a prefix of
/// every path in the allowlist check.
fn resolve_home_dir() -> Option<PathBuf> {
    ["HOME", "USERPROFILE"]
        .iter()
        .filter_map(|key| env::var(key).ok())
        .find(|value| !value.is_empty())
        .map(PathBuf::from)
}
422
/// Reports whether `path` equals `root` or lies beneath it.
///
/// On Windows the comparison is case-insensitive and accepts either
/// separator after the root; elsewhere it is a component-wise prefix test.
/// An empty `root` never matches: it has no components and would otherwise
/// be a prefix of every path.
fn path_starts_with(path: &Path, root: &Path) -> bool {
    if root.as_os_str().is_empty() {
        return false;
    }

    #[cfg(windows)]
    {
        let p = path.to_string_lossy().to_lowercase();
        let r = root.to_string_lossy().to_lowercase();
        // Require a separator after the root so `C:\Users\ab` is not
        // treated as being inside `C:\Users\a`.
        return p == r || p.starts_with(&format!("{r}\\")) || p.starts_with(&format!("{r}/"));
    }

    #[cfg(not(windows))]
    {
        path.starts_with(root)
    }
}
438
#[cfg(test)]
mod tests {
    use std::time::{SystemTime, UNIX_EPOCH};

    use crate::config::AppConfig;

    use super::{
        SourceDef, SourceManifest, add_local_source, load_local_sources, validate_manifest,
        validate_source,
    };

    /// Builds a bare jsonl source rooted at `root`, with every optional field unset.
    fn jsonl_source(id: &str, root: &str) -> SourceDef {
        SourceDef {
            id: id.to_string(),
            display_name: None,
            roots: vec![root.to_string()],
            globs: vec!["**/*.jsonl".to_string()],
            format: "jsonl".to_string(),
            parser_hint: None,
            platforms: None,
            requires_opt_in: None,
        }
    }

    /// A source added through `add_local_source` can be read back from disk.
    #[test]
    fn add_local_source_persists_manifest() {
        // A nanosecond timestamp keeps the temp file name unique per run.
        let nonce = SystemTime::now()
            .duration_since(UNIX_EPOCH)
            .expect("clock drift")
            .as_nanos();
        let test_path = std::env::temp_dir().join(format!("trace-share-sources-{nonce}.toml"));

        let mut cfg = AppConfig::default();
        cfg.sources_path = Some(test_path.clone());

        let src = SourceDef {
            display_name: Some("Demo".to_string()),
            parser_hint: Some("generic".to_string()),
            requires_opt_in: Some(true),
            ..jsonl_source("demo_source", "~/demo")
        };

        add_local_source(&cfg, src).expect("add source");
        let loaded = load_local_sources(&cfg)
            .expect("load manifest")
            .expect("manifest exists");
        let persisted = loaded.sources.iter().any(|s| s.id == "demo_source");
        assert!(persisted);

        let _ = std::fs::remove_file(test_path);
    }

    /// An id containing a space is rejected.
    #[test]
    fn rejects_invalid_source_id() {
        let src = jsonl_source("bad id", "~/demo");
        assert!(validate_source(&src).is_err());
    }

    /// A manifest listing the same id twice is rejected.
    #[test]
    fn rejects_duplicate_source_ids_in_manifest() {
        let source = SourceDef {
            requires_opt_in: Some(false),
            ..jsonl_source("dup_source", "~/.codex/sessions")
        };
        let manifest = SourceManifest {
            version: Some(1),
            sources: vec![source.clone(), source],
        };
        assert!(validate_manifest(&manifest).is_err());
    }
}