Skip to main content

trace_share_core/
sources.rs

1use anyhow::{Context, Result};
2use chrono::{DateTime, Utc};
3use glob::Pattern;
4use serde::{Deserialize, Serialize};
5use std::{
6    collections::HashMap,
7    collections::HashSet,
8    env, fs,
9    path::{Component, Path, PathBuf},
10};
11use tracing::warn;
12
13use crate::config::{AppConfig, data_dir, default_sources_path};
14
/// Describes one place on disk where trace/session files can be discovered.
///
/// Definitions come from the built-in list, the local sources file, or the
/// remote registry, and are (de)serialized with serde.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SourceDef {
    /// Unique identifier. Validation only accepts ASCII alphanumerics, `_`,
    /// `-` and `.`; later definitions with the same id override earlier ones.
    pub id: String,
    /// Optional human-readable label.
    pub display_name: Option<String>,
    /// Directories to scan. A leading `~` is expanded from the environment and
    /// each root must lie inside an allowlisted user location to validate.
    pub roots: Vec<String>,
    /// Glob patterns matched against file paths relative to a root.
    pub globs: Vec<String>,
    /// File format; validation accepts `jsonl`, `json` or `mixed`.
    pub format: String,
    /// Parser selector (e.g. `codex_cli_v1`). Not interpreted in this file.
    pub parser_hint: Option<String>,
    /// Platform names such as `linux`, `macos`, `windows`. Not read in this
    /// file; presumably used by callers to filter sources (TODO confirm).
    pub platforms: Option<Vec<String>>,
    /// Opt-in flag. Not read in this file; presumably callers require explicit
    /// user consent when it is `Some(true)` (TODO confirm).
    pub requires_opt_in: Option<bool>,
}
26
/// A versioned collection of source definitions, as stored in the local
/// sources file or served by the remote registry.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct SourceManifest {
    /// Manifest schema version. When present, validation only accepts `1`.
    pub version: Option<u32>,
    /// The source definitions; validation requires at least one entry and
    /// unique ids.
    pub sources: Vec<SourceDef>,
}
32
/// On-disk snapshot of the last successfully fetched remote registry.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct CachedRegistry {
    /// When the snapshot was taken; compared against the configured TTL.
    fetched_at: DateTime<Utc>,
    /// `ETag` response header of the fetch, if any; replayed as
    /// `If-None-Match` on the next request.
    etag: Option<String>,
    /// The manifest that was fetched.
    manifest: SourceManifest,
}
39
40pub fn builtin_sources() -> Vec<SourceDef> {
41    vec![
42        SourceDef {
43            id: "codex_cli".to_string(),
44            display_name: Some("Codex CLI".to_string()),
45            roots: vec!["~/.codex/sessions".to_string()],
46            globs: vec!["**/*".to_string()],
47            format: "jsonl".to_string(),
48            parser_hint: Some("codex_cli_v1".to_string()),
49            platforms: None,
50            requires_opt_in: Some(false),
51        },
52        SourceDef {
53            id: "claude_code".to_string(),
54            display_name: Some("Claude Code".to_string()),
55            roots: vec!["~/.claude/projects".to_string()],
56            globs: vec!["**/sessions/*.jsonl".to_string()],
57            format: "jsonl".to_string(),
58            parser_hint: Some("claude_code_v1".to_string()),
59            platforms: None,
60            requires_opt_in: Some(false),
61        },
62        SourceDef {
63            id: "vscode_global_storage".to_string(),
64            display_name: Some("VS Code Global Storage".to_string()),
65            roots: vec![
66                "~/.config/Code/User/globalStorage".to_string(),
67                "~/Library/Application Support/Code/User/globalStorage".to_string(),
68                "~/AppData/Roaming/Code/User/globalStorage".to_string(),
69            ],
70            globs: vec!["**/*.jsonl".to_string(), "**/*.json".to_string()],
71            format: "jsonl".to_string(),
72            parser_hint: Some("vscode_storage_v1".to_string()),
73            platforms: None,
74            requires_opt_in: Some(false),
75        },
76        SourceDef {
77            id: "tandem_sessions".to_string(),
78            display_name: Some("Tandem Sessions".to_string()),
79            roots: vec![
80                "~/.local/share/tandem/data/storage".to_string(),
81                "~/Library/Application Support/tandem/data/storage".to_string(),
82                "~/AppData/Roaming/tandem/data/storage".to_string(),
83            ],
84            globs: vec!["**/sessions.json".to_string()],
85            format: "json".to_string(),
86            parser_hint: Some("tandem_v1".to_string()),
87            platforms: Some(vec![
88                "linux".to_string(),
89                "macos".to_string(),
90                "windows".to_string(),
91            ]),
92            requires_opt_in: Some(true),
93        },
94    ]
95}
96
97pub async fn resolve_sources(config: &AppConfig) -> Result<Vec<SourceDef>> {
98    let mut merged = builtin_sources();
99
100    if config.remote_registry.enabled {
101        if let Ok(remote) = load_remote_registry(config).await {
102            merged.extend(remote.sources);
103        }
104    }
105
106    if let Some(local) = load_local_sources(config)? {
107        merged.extend(local.sources);
108    }
109
110    let merged = merge_with_override(merged);
111    let mut valid = Vec::new();
112    for source in merged {
113        match validate_source(&source) {
114            Ok(()) => valid.push(source),
115            Err(e) => warn!("skipping unsafe source {}: {e}", source.id),
116        }
117    }
118    Ok(valid)
119}
120
121fn merge_with_override(input: Vec<SourceDef>) -> Vec<SourceDef> {
122    let mut out = Vec::new();
123    let mut seen = HashSet::new();
124    for source in input.into_iter().rev() {
125        if seen.insert(source.id.clone()) {
126            out.push(source);
127        }
128    }
129    out.reverse();
130    out
131}
132
133pub fn load_local_sources(config: &AppConfig) -> Result<Option<SourceManifest>> {
134    let p = config
135        .sources_path
136        .clone()
137        .unwrap_or(default_sources_path()?);
138    if !p.exists() {
139        return Ok(None);
140    }
141    let text = fs::read_to_string(&p)
142        .with_context(|| format!("failed reading sources file {}", p.display()))?;
143    let manifest = toml::from_str::<SourceManifest>(&text).context("invalid local sources.toml")?;
144    validate_manifest(&manifest)?;
145    Ok(Some(manifest))
146}
147
148pub fn add_local_source(config: &AppConfig, source: SourceDef) -> Result<PathBuf> {
149    validate_source(&source)?;
150    let path = config
151        .sources_path
152        .clone()
153        .unwrap_or(default_sources_path()?);
154    if let Some(parent) = path.parent() {
155        fs::create_dir_all(parent)?;
156    }
157
158    let mut manifest = if path.exists() {
159        let text = fs::read_to_string(&path)
160            .with_context(|| format!("failed reading sources file {}", path.display()))?;
161        toml::from_str::<SourceManifest>(&text).context("invalid local sources.toml")?
162    } else {
163        SourceManifest {
164            version: Some(1),
165            sources: Vec::new(),
166        }
167    };
168
169    if let Some(existing) = manifest.sources.iter_mut().find(|s| s.id == source.id) {
170        *existing = source;
171    } else {
172        manifest.sources.push(source);
173    }
174
175    manifest.sources.sort_by(|a, b| a.id.cmp(&b.id));
176    let text = toml::to_string_pretty(&manifest)?;
177    fs::write(&path, text)?;
178    Ok(path)
179}
180
181pub async fn load_remote_registry(config: &AppConfig) -> Result<SourceManifest> {
182    let url = config
183        .remote_registry
184        .url
185        .clone()
186        .context("remote registry url missing")?;
187
188    let cache_path = data_dir()?.join("registry-cache.json");
189    let cached = read_cache(&cache_path).ok();
190    let ttl_hours = config.remote_registry.cache_ttl_hours.max(1);
191
192    if let Some(cached) = &cached {
193        let age = Utc::now() - cached.fetched_at;
194        if age.num_hours() < ttl_hours as i64 {
195            return Ok(cached.manifest.clone());
196        }
197    }
198
199    let client = reqwest::Client::new();
200    let mut req = client.get(&url);
201    if let Some(cached) = &cached {
202        if let Some(etag) = &cached.etag {
203            req = req.header(reqwest::header::IF_NONE_MATCH, etag);
204        }
205    }
206
207    let resp = req.send().await?;
208    if resp.status() == reqwest::StatusCode::NOT_MODIFIED {
209        if let Some(cached) = cached {
210            return Ok(cached.manifest);
211        }
212    }
213
214    let status = resp.status();
215    let etag = resp
216        .headers()
217        .get(reqwest::header::ETAG)
218        .and_then(|v| v.to_str().ok())
219        .map(str::to_string);
220
221    if !status.is_success() {
222        if let Some(cached) = cached {
223            return Ok(cached.manifest);
224        }
225        anyhow::bail!("remote registry fetch failed: {status}");
226    }
227
228    let body = resp.text().await?;
229    let manifest = toml::from_str::<SourceManifest>(&body).context("invalid remote manifest")?;
230    validate_manifest(&manifest)?;
231    let snapshot = CachedRegistry {
232        fetched_at: Utc::now(),
233        etag,
234        manifest: manifest.clone(),
235    };
236    fs::write(cache_path, serde_json::to_vec_pretty(&snapshot)?)?;
237    Ok(manifest)
238}
239
240fn read_cache(path: &Path) -> Result<CachedRegistry> {
241    let bytes = fs::read(path)?;
242    Ok(serde_json::from_slice(&bytes)?)
243}
244
245pub fn discover_files(source: &SourceDef) -> Result<Vec<PathBuf>> {
246    validate_source(source)?;
247    let mut files = Vec::new();
248    let max_files = 5000usize;
249    let max_file_bytes = 20 * 1024 * 1024u64;
250
251    for root in &source.roots {
252        let expanded = expand_tilde(root);
253        if !is_root_allowlisted(&expanded) {
254            continue;
255        }
256        if !expanded.exists() {
257            continue;
258        }
259
260        for entry in ignore::WalkBuilder::new(&expanded)
261            .hidden(false)
262            .git_ignore(false)
263            .build()
264        {
265            let entry = match entry {
266                Ok(e) => e,
267                Err(_) => continue,
268            };
269
270            if !entry.file_type().map(|f| f.is_file()).unwrap_or(false) {
271                continue;
272            }
273
274            let relative = entry.path().strip_prefix(&expanded).unwrap_or(entry.path());
275            if matches_any_glob(relative, &source.globs) {
276                if let Ok(md) = entry.metadata() {
277                    if md.len() > max_file_bytes {
278                        continue;
279                    }
280                }
281                files.push(entry.path().to_path_buf());
282                if files.len() >= max_files {
283                    break;
284                }
285            }
286        }
287        if files.len() >= max_files {
288            break;
289        }
290    }
291
292    files.sort();
293    files.dedup();
294    Ok(files)
295}
296
297fn matches_any_glob(path: &Path, globs: &[String]) -> bool {
298    let path_text = path.to_string_lossy();
299    globs
300        .iter()
301        .filter_map(|g| Pattern::new(g).ok())
302        .any(|p| p.matches(&path_text))
303}
304
305pub fn validate_manifest(manifest: &SourceManifest) -> Result<()> {
306    if manifest.sources.is_empty() {
307        anyhow::bail!("manifest must contain at least one source");
308    }
309    if let Some(version) = manifest.version {
310        if version != 1 {
311            anyhow::bail!("unsupported manifest version: {version} (expected 1)");
312        }
313    }
314
315    let mut counts: HashMap<&str, usize> = HashMap::new();
316    for source in &manifest.sources {
317        *counts.entry(source.id.as_str()).or_insert(0) += 1;
318        validate_source(source)?;
319    }
320
321    let duplicates = counts
322        .into_iter()
323        .filter_map(|(id, n)| if n > 1 { Some(id.to_string()) } else { None })
324        .collect::<Vec<_>>();
325    if !duplicates.is_empty() {
326        anyhow::bail!(
327            "duplicate source ids in manifest: {}",
328            duplicates.join(", ")
329        );
330    }
331
332    Ok(())
333}
334
335pub fn validate_source(source: &SourceDef) -> Result<()> {
336    if source.id.trim().is_empty() {
337        anyhow::bail!("source id cannot be empty");
338    }
339    let valid_id = source
340        .id
341        .chars()
342        .all(|c| c.is_ascii_alphanumeric() || c == '_' || c == '-' || c == '.');
343    if !valid_id {
344        anyhow::bail!("source id contains invalid characters");
345    }
346
347    if source.roots.is_empty() {
348        anyhow::bail!("source must declare at least one root");
349    }
350    if source.globs.is_empty() {
351        anyhow::bail!("source must declare at least one glob");
352    }
353    if source.format.trim().is_empty() {
354        anyhow::bail!("source format cannot be empty");
355    }
356    if !matches!(source.format.as_str(), "jsonl" | "json" | "mixed") {
357        anyhow::bail!("source format must be one of: jsonl, json, mixed");
358    }
359
360    for root in &source.roots {
361        let expanded = expand_tilde(root);
362        if has_parent_traversal(&expanded) {
363            anyhow::bail!("root has path traversal segments");
364        }
365        if !is_root_allowlisted(&expanded) {
366            anyhow::bail!(
367                "root is outside allowlisted user locations: {}",
368                expanded.display()
369            );
370        }
371    }
372
373    for g in &source.globs {
374        Pattern::new(g).with_context(|| format!("invalid glob pattern: {g}"))?;
375        if g.contains("..") {
376            anyhow::bail!("glob cannot include parent traversal");
377        }
378    }
379
380    Ok(())
381}
382
/// Reports whether `path` contains a `..` component.
///
/// Only whole components count, so a name such as `a..b` is not a match.
fn has_parent_traversal(path: &Path) -> bool {
    for part in path.components() {
        if let Component::ParentDir = part {
            return true;
        }
    }
    false
}
386
/// Reports whether `path` lies inside a user-owned base directory.
///
/// The base directories are read from the `HOME`, `USERPROFILE` and `APPDATA`
/// environment variables. The comparison is lexical and component-wise
/// (`Path::starts_with`); symlinks are not resolved.
///
/// Unset and empty variables are ignored. An empty base must never be used:
/// every path "starts with" the empty path, which would allowlist the whole
/// filesystem.
fn is_root_allowlisted(path: &Path) -> bool {
    // `var_os` keeps base directories that are not valid UTF-8 usable.
    // `USERPROFILE` is the home directory on Windows, where `HOME` is usually
    // unset.
    ["HOME", "USERPROFILE", "APPDATA"]
        .iter()
        .filter_map(|key| env::var_os(key))
        .filter(|base| !base.is_empty())
        .any(|base| path.starts_with(&base))
}
402
/// Expands a leading `~` in `input` to the user's home directory.
///
/// Only `~` on its own and the `~/...` form are expanded; `~user/...` and any
/// other input are returned unchanged. The home directory is the first
/// non-empty value of `HOME`, then `USERPROFILE` (the Windows equivalent).
/// When neither is available the input is returned unchanged.
///
/// Empty values are ignored on purpose: substituting an empty home would turn
/// `~/x` into the absolute path `/x`.
fn expand_tilde(input: &str) -> PathBuf {
    if input == "~" || input.starts_with("~/") {
        let home = ["HOME", "USERPROFILE"]
            .iter()
            .filter_map(|key| env::var_os(key))
            .find(|value| !value.is_empty());
        if let Some(mut expanded) = home {
            // The guard above guarantees byte 0 is the one-byte `~`, so the
            // remainder starts at index 1.
            expanded.push(&input[1..]);
            return PathBuf::from(expanded);
        }
    }
    PathBuf::from(input)
}
411
#[cfg(test)]
mod tests {
    use std::time::{SystemTime, UNIX_EPOCH};

    use crate::config::AppConfig;

    use super::{
        SourceDef, SourceManifest, add_local_source, load_local_sources, validate_manifest,
        validate_source,
    };

    /// Builds a jsonl source with the given id and single root; every optional
    /// field is `None` so each test only spells out what it cares about.
    fn jsonl_source(id: &str, root: &str) -> SourceDef {
        SourceDef {
            id: id.to_string(),
            display_name: None,
            roots: vec![root.to_string()],
            globs: vec!["**/*.jsonl".to_string()],
            format: "jsonl".to_string(),
            parser_hint: None,
            platforms: None,
            requires_opt_in: None,
        }
    }

    /// Adding a source writes a manifest that can be loaded back.
    #[test]
    fn add_local_source_persists_manifest() {
        // A nanosecond timestamp keeps the temp file name unique per run.
        let nonce = SystemTime::now()
            .duration_since(UNIX_EPOCH)
            .expect("clock drift")
            .as_nanos();
        let test_path = std::env::temp_dir().join(format!("trace-share-sources-{nonce}.toml"));

        let mut cfg = AppConfig::default();
        cfg.sources_path = Some(test_path.clone());

        let src = SourceDef {
            display_name: Some("Demo".to_string()),
            parser_hint: Some("generic".to_string()),
            requires_opt_in: Some(true),
            ..jsonl_source("demo_source", "~/demo")
        };

        add_local_source(&cfg, src).expect("add source");
        let loaded = load_local_sources(&cfg)
            .expect("load manifest")
            .expect("manifest exists");
        assert!(loaded.sources.iter().any(|s| s.id == "demo_source"));

        let _ = std::fs::remove_file(test_path);
    }

    /// An id containing a space is rejected.
    #[test]
    fn rejects_invalid_source_id() {
        let src = jsonl_source("bad id", "~/demo");
        assert!(validate_source(&src).is_err());
    }

    /// A manifest that lists the same id twice is rejected.
    #[test]
    fn rejects_duplicate_source_ids_in_manifest() {
        let source = SourceDef {
            requires_opt_in: Some(false),
            ..jsonl_source("dup_source", "~/.codex/sessions")
        };
        let manifest = SourceManifest {
            version: Some(1),
            sources: vec![source.clone(), source],
        };
        assert!(validate_manifest(&manifest).is_err());
    }
}