Skip to main content

trace_share_core/
sources.rs

1use anyhow::{Context, Result};
2use chrono::{DateTime, Utc};
3use glob::Pattern;
4use serde::{Deserialize, Serialize};
5use std::{
6    collections::HashMap,
7    collections::HashSet,
8    env, fs,
9    path::{Component, Path, PathBuf},
10};
11use tracing::warn;
12
13use crate::config::{AppConfig, data_dir, default_sources_path};
14use crate::security::{ensure_secure_url, write_private_file};
15
16#[derive(Debug, Clone, Serialize, Deserialize)]
17pub struct SourceDef {
18    pub id: String,
19    pub display_name: Option<String>,
20    pub roots: Vec<String>,
21    pub globs: Vec<String>,
22    pub format: String,
23    pub parser_hint: Option<String>,
24    pub platforms: Option<Vec<String>>,
25    pub requires_opt_in: Option<bool>,
26}
27
28#[derive(Debug, Clone, Default, Serialize, Deserialize)]
29pub struct SourceManifest {
30    pub version: Option<u32>,
31    pub sources: Vec<SourceDef>,
32}
33
34#[derive(Debug, Clone, Serialize, Deserialize)]
35struct CachedRegistry {
36    fetched_at: DateTime<Utc>,
37    etag: Option<String>,
38    manifest: SourceManifest,
39}
40
41pub fn builtin_sources() -> Vec<SourceDef> {
42    vec![
43        SourceDef {
44            id: "codex_cli".to_string(),
45            display_name: Some("Codex CLI".to_string()),
46            roots: vec!["~/.codex/sessions".to_string()],
47            globs: vec!["**/*".to_string()],
48            format: "jsonl".to_string(),
49            parser_hint: Some("codex_cli_v1".to_string()),
50            platforms: None,
51            requires_opt_in: Some(false),
52        },
53        SourceDef {
54            id: "claude_code".to_string(),
55            display_name: Some("Claude Code".to_string()),
56            roots: vec!["~/.claude/projects".to_string()],
57            globs: vec!["**/sessions/*.jsonl".to_string()],
58            format: "jsonl".to_string(),
59            parser_hint: Some("claude_code_v1".to_string()),
60            platforms: None,
61            requires_opt_in: Some(false),
62        },
63        SourceDef {
64            id: "vscode_global_storage".to_string(),
65            display_name: Some("VS Code Global Storage".to_string()),
66            roots: vec![
67                "~/.config/Code/User/globalStorage".to_string(),
68                "~/Library/Application Support/Code/User/globalStorage".to_string(),
69                "~/AppData/Roaming/Code/User/globalStorage".to_string(),
70            ],
71            globs: vec!["**/*.jsonl".to_string(), "**/*.json".to_string()],
72            format: "jsonl".to_string(),
73            parser_hint: Some("vscode_storage_v1".to_string()),
74            platforms: None,
75            requires_opt_in: Some(false),
76        },
77        SourceDef {
78            id: "tandem_sessions".to_string(),
79            display_name: Some("Tandem Sessions".to_string()),
80            roots: vec![
81                "~/.local/share/tandem/data/storage".to_string(),
82                "~/Library/Application Support/tandem/data/storage".to_string(),
83                "~/AppData/Roaming/tandem/data/storage".to_string(),
84            ],
85            globs: vec!["**/sessions.json".to_string()],
86            format: "json".to_string(),
87            parser_hint: Some("tandem_v1".to_string()),
88            platforms: Some(vec![
89                "linux".to_string(),
90                "macos".to_string(),
91                "windows".to_string(),
92            ]),
93            requires_opt_in: Some(true),
94        },
95    ]
96}
97
98pub async fn resolve_sources(config: &AppConfig) -> Result<Vec<SourceDef>> {
99    let mut merged = builtin_sources();
100
101    if config.remote_registry.enabled {
102        if let Ok(remote) = load_remote_registry(config).await {
103            merged.extend(remote.sources);
104        }
105    }
106
107    if let Some(local) = load_local_sources(config)? {
108        merged.extend(local.sources);
109    }
110
111    let merged = merge_with_override(merged);
112    let mut valid = Vec::new();
113    for source in merged {
114        match validate_source(&source) {
115            Ok(()) => valid.push(source),
116            Err(e) => warn!("skipping unsafe source {}: {e}", source.id),
117        }
118    }
119    Ok(valid)
120}
121
122fn merge_with_override(input: Vec<SourceDef>) -> Vec<SourceDef> {
123    let mut out = Vec::new();
124    let mut seen = HashSet::new();
125    for source in input.into_iter().rev() {
126        if seen.insert(source.id.clone()) {
127            out.push(source);
128        }
129    }
130    out.reverse();
131    out
132}
133
134pub fn load_local_sources(config: &AppConfig) -> Result<Option<SourceManifest>> {
135    let p = config
136        .sources_path
137        .clone()
138        .unwrap_or(default_sources_path()?);
139    if !p.exists() {
140        return Ok(None);
141    }
142    let text = fs::read_to_string(&p)
143        .with_context(|| format!("failed reading sources file {}", p.display()))?;
144    let manifest = toml::from_str::<SourceManifest>(&text).context("invalid local sources.toml")?;
145    validate_manifest(&manifest)?;
146    Ok(Some(manifest))
147}
148
149pub fn add_local_source(config: &AppConfig, source: SourceDef) -> Result<PathBuf> {
150    validate_source(&source)?;
151    let path = config
152        .sources_path
153        .clone()
154        .unwrap_or(default_sources_path()?);
155    if let Some(parent) = path.parent() {
156        fs::create_dir_all(parent)?;
157    }
158
159    let mut manifest = if path.exists() {
160        let text = fs::read_to_string(&path)
161            .with_context(|| format!("failed reading sources file {}", path.display()))?;
162        toml::from_str::<SourceManifest>(&text).context("invalid local sources.toml")?
163    } else {
164        SourceManifest {
165            version: Some(1),
166            sources: Vec::new(),
167        }
168    };
169
170    if let Some(existing) = manifest.sources.iter_mut().find(|s| s.id == source.id) {
171        *existing = source;
172    } else {
173        manifest.sources.push(source);
174    }
175
176    manifest.sources.sort_by(|a, b| a.id.cmp(&b.id));
177    let text = toml::to_string_pretty(&manifest)?;
178    write_private_file(&path, text.as_bytes())?;
179    Ok(path)
180}
181
182pub async fn load_remote_registry(config: &AppConfig) -> Result<SourceManifest> {
183    let url = config
184        .remote_registry
185        .url
186        .clone()
187        .context("remote registry url missing")?;
188    ensure_secure_url(&url, "remote registry URL")?;
189
190    let cache_path = data_dir()?.join("registry-cache.json");
191    let cached = read_cache(&cache_path).ok();
192    let ttl_hours = config.remote_registry.cache_ttl_hours.max(1);
193
194    if let Some(cached) = &cached {
195        let age = Utc::now() - cached.fetched_at;
196        if age.num_hours() < ttl_hours as i64 {
197            return Ok(cached.manifest.clone());
198        }
199    }
200
201    let client = reqwest::Client::new();
202    let mut req = client.get(&url);
203    if let Some(cached) = &cached {
204        if let Some(etag) = &cached.etag {
205            req = req.header(reqwest::header::IF_NONE_MATCH, etag);
206        }
207    }
208
209    let resp = req.send().await?;
210    if resp.status() == reqwest::StatusCode::NOT_MODIFIED {
211        if let Some(cached) = cached {
212            return Ok(cached.manifest);
213        }
214    }
215
216    let status = resp.status();
217    let etag = resp
218        .headers()
219        .get(reqwest::header::ETAG)
220        .and_then(|v| v.to_str().ok())
221        .map(str::to_string);
222
223    if !status.is_success() {
224        if let Some(cached) = cached {
225            return Ok(cached.manifest);
226        }
227        anyhow::bail!("remote registry fetch failed: {status}");
228    }
229
230    let body = resp.text().await?;
231    let manifest = toml::from_str::<SourceManifest>(&body).context("invalid remote manifest")?;
232    validate_manifest(&manifest)?;
233    let snapshot = CachedRegistry {
234        fetched_at: Utc::now(),
235        etag,
236        manifest: manifest.clone(),
237    };
238    let bytes = serde_json::to_vec_pretty(&snapshot)?;
239    write_private_file(&cache_path, &bytes)?;
240    Ok(manifest)
241}
242
243fn read_cache(path: &Path) -> Result<CachedRegistry> {
244    let bytes = fs::read(path)?;
245    Ok(serde_json::from_slice(&bytes)?)
246}
247
248pub fn discover_files(source: &SourceDef) -> Result<Vec<PathBuf>> {
249    validate_source(source)?;
250    let mut files = Vec::new();
251    let max_files = 5000usize;
252    let max_file_bytes = 20 * 1024 * 1024u64;
253
254    for root in &source.roots {
255        let expanded = expand_tilde(root);
256        if !is_root_allowlisted(&expanded) {
257            continue;
258        }
259        if !expanded.exists() {
260            continue;
261        }
262
263        for entry in ignore::WalkBuilder::new(&expanded)
264            .hidden(false)
265            .git_ignore(false)
266            .build()
267        {
268            let entry = match entry {
269                Ok(e) => e,
270                Err(_) => continue,
271            };
272
273            if !entry.file_type().map(|f| f.is_file()).unwrap_or(false) {
274                continue;
275            }
276
277            let relative = entry.path().strip_prefix(&expanded).unwrap_or(entry.path());
278            if matches_any_glob(relative, &source.globs) {
279                if let Ok(md) = entry.metadata() {
280                    if md.len() > max_file_bytes {
281                        continue;
282                    }
283                }
284                files.push(entry.path().to_path_buf());
285                if files.len() >= max_files {
286                    break;
287                }
288            }
289        }
290        if files.len() >= max_files {
291            break;
292        }
293    }
294
295    files.sort();
296    files.dedup();
297    Ok(files)
298}
299
300fn matches_any_glob(path: &Path, globs: &[String]) -> bool {
301    let path_text = path.to_string_lossy();
302    globs
303        .iter()
304        .filter_map(|g| Pattern::new(g).ok())
305        .any(|p| p.matches(&path_text))
306}
307
308pub fn validate_manifest(manifest: &SourceManifest) -> Result<()> {
309    if manifest.sources.is_empty() {
310        anyhow::bail!("manifest must contain at least one source");
311    }
312    if let Some(version) = manifest.version {
313        if version != 1 {
314            anyhow::bail!("unsupported manifest version: {version} (expected 1)");
315        }
316    }
317
318    let mut counts: HashMap<&str, usize> = HashMap::new();
319    for source in &manifest.sources {
320        *counts.entry(source.id.as_str()).or_insert(0) += 1;
321        validate_source(source)?;
322    }
323
324    let duplicates = counts
325        .into_iter()
326        .filter_map(|(id, n)| if n > 1 { Some(id.to_string()) } else { None })
327        .collect::<Vec<_>>();
328    if !duplicates.is_empty() {
329        anyhow::bail!(
330            "duplicate source ids in manifest: {}",
331            duplicates.join(", ")
332        );
333    }
334
335    Ok(())
336}
337
338pub fn validate_source(source: &SourceDef) -> Result<()> {
339    if source.id.trim().is_empty() {
340        anyhow::bail!("source id cannot be empty");
341    }
342    let valid_id = source
343        .id
344        .chars()
345        .all(|c| c.is_ascii_alphanumeric() || c == '_' || c == '-' || c == '.');
346    if !valid_id {
347        anyhow::bail!("source id contains invalid characters");
348    }
349
350    if source.roots.is_empty() {
351        anyhow::bail!("source must declare at least one root");
352    }
353    if source.globs.is_empty() {
354        anyhow::bail!("source must declare at least one glob");
355    }
356    if source.format.trim().is_empty() {
357        anyhow::bail!("source format cannot be empty");
358    }
359    if !matches!(source.format.as_str(), "jsonl" | "json" | "mixed") {
360        anyhow::bail!("source format must be one of: jsonl, json, mixed");
361    }
362
363    for root in &source.roots {
364        let expanded = expand_tilde(root);
365        if has_parent_traversal(&expanded) {
366            anyhow::bail!("root has path traversal segments");
367        }
368        if !is_root_allowlisted(&expanded) {
369            anyhow::bail!(
370                "root is outside allowlisted user locations: {}",
371                expanded.display()
372            );
373        }
374    }
375
376    for g in &source.globs {
377        Pattern::new(g).with_context(|| format!("invalid glob pattern: {g}"))?;
378        if g.contains("..") {
379            anyhow::bail!("glob cannot include parent traversal");
380        }
381    }
382
383    Ok(())
384}
385
/// Reports whether any component of `path` is a `..` segment.
fn has_parent_traversal(path: &Path) -> bool {
    for part in path.components() {
        if let Component::ParentDir = part {
            return true;
        }
    }
    false
}
389
390fn is_root_allowlisted(path: &Path) -> bool {
391    allowlisted_roots()
392        .into_iter()
393        .any(|root| path_starts_with(path, &root))
394}
395
396fn expand_tilde(input: &str) -> PathBuf {
397    if input == "~" || input.starts_with("~/") || input.starts_with("~\\") {
398        if let Some(home) = resolve_home_dir() {
399            return PathBuf::from(input.replacen('~', home.to_string_lossy().as_ref(), 1));
400        }
401    }
402    PathBuf::from(input)
403}
404
405fn allowlisted_roots() -> Vec<PathBuf> {
406    let mut roots = Vec::new();
407    if let Some(home) = resolve_home_dir() {
408        roots.push(home);
409    }
410    if let Ok(appdata) = env::var("APPDATA") {
411        roots.push(PathBuf::from(appdata));
412    }
413    if let Ok(local_appdata) = env::var("LOCALAPPDATA") {
414        roots.push(PathBuf::from(local_appdata));
415    }
416    roots
417}
418
/// Returns the user's home directory from the environment.
///
/// `HOME` is preferred and `USERPROFILE` is the fallback. A variable that is
/// unset, not valid Unicode, or empty is skipped: an empty home would become an
/// empty allowlist root, and every path starts with the empty path.
fn resolve_home_dir() -> Option<PathBuf> {
    ["HOME", "USERPROFILE"]
        .iter()
        .filter_map(|key| env::var(key).ok())
        .find(|value| !value.is_empty())
        .map(PathBuf::from)
}
425
/// Reports whether `path` equals `root` or lies beneath it.
///
/// On Windows the comparison is done on lowercased text and accepts either
/// separator after the root; elsewhere it is a component-wise prefix test.
/// An empty `root` never matches.
fn path_starts_with(path: &Path, root: &Path) -> bool {
    // `Path::starts_with` is true for an empty prefix, so an empty root (for
    // example from an environment variable set to "") would allowlist every
    // path. Treat it as matching nothing.
    if root.as_os_str().is_empty() {
        return false;
    }

    #[cfg(windows)]
    {
        let p = path.to_string_lossy().to_lowercase();
        let r = root.to_string_lossy().to_lowercase();
        return p == r
            || p.strip_prefix(&(r.clone() + "\\")).is_some()
            || p.strip_prefix(&(r + "/")).is_some();
    }

    #[cfg(not(windows))]
    {
        path.starts_with(root)
    }
}
441
#[cfg(test)]
mod tests {
    use std::time::{SystemTime, UNIX_EPOCH};

    use crate::config::AppConfig;

    use super::{
        SourceDef, SourceManifest, add_local_source, load_local_sources, validate_manifest,
        validate_source,
    };

    /// Builds a `jsonl` source with one root, the `**/*.jsonl` glob and every
    /// optional field unset; tests override what they need.
    fn jsonl_source(id: &str, root: &str) -> SourceDef {
        SourceDef {
            id: id.to_string(),
            display_name: None,
            roots: vec![root.to_string()],
            globs: vec!["**/*.jsonl".to_string()],
            format: "jsonl".to_string(),
            parser_hint: None,
            platforms: None,
            requires_opt_in: None,
        }
    }

    /// A source written with `add_local_source` can be read back.
    #[test]
    fn add_local_source_persists_manifest() {
        // A nanosecond timestamp keeps the manifest path unique per run.
        let nonce = SystemTime::now()
            .duration_since(UNIX_EPOCH)
            .expect("clock drift")
            .as_nanos();
        let test_path = std::env::temp_dir().join(format!("trace-share-sources-{nonce}.toml"));

        let mut cfg = AppConfig::default();
        cfg.sources_path = Some(test_path.clone());

        let src = SourceDef {
            display_name: Some("Demo".to_string()),
            parser_hint: Some("generic".to_string()),
            requires_opt_in: Some(true),
            ..jsonl_source("demo_source", "~/demo")
        };

        add_local_source(&cfg, src).expect("add source");
        let loaded = load_local_sources(&cfg)
            .expect("load manifest")
            .expect("manifest exists");
        assert!(loaded.sources.iter().any(|s| s.id == "demo_source"));

        let _ = std::fs::remove_file(test_path);
    }

    /// Ids containing a space are refused.
    #[test]
    fn rejects_invalid_source_id() {
        let src = jsonl_source("bad id", "~/demo");
        assert!(validate_source(&src).is_err());
    }

    /// A manifest listing the same id twice is refused.
    #[test]
    fn rejects_duplicate_source_ids_in_manifest() {
        let source = SourceDef {
            requires_opt_in: Some(false),
            ..jsonl_source("dup_source", "~/.codex/sessions")
        };
        let manifest = SourceManifest {
            version: Some(1),
            sources: vec![source.clone(), source],
        };
        assert!(validate_manifest(&manifest).is_err());
    }
}