Skip to main content

chunkshop/sources/
files.rs

1//! Files source. Mirrors `python/src/chunkshop/sources/files.py`.
2
3use std::path::{Path, PathBuf};
4
5use anyhow::{anyhow, Context, Result};
6use serde_json::json;
7use sha1::{Digest, Sha1};
8
9use crate::config::FilesSourceConfig;
10use crate::sources::base::Document;
11
12pub struct FilesSource {
13    cfg: FilesSourceConfig,
14}
15
16impl FilesSource {
17    pub fn new(cfg: FilesSourceConfig) -> Self {
18        Self { cfg }
19    }
20
21    /// Enumerate files matching the glob, in sorted order, reading each as text.
22    pub fn iter_documents(&self) -> Result<Vec<Document>> {
23        let mut paths: Vec<PathBuf> = glob::glob(&self.cfg.glob)
24            .with_context(|| format!("invalid glob {:?}", self.cfg.glob))?
25            .filter_map(std::result::Result::ok)
26            .collect();
27        if paths.is_empty() {
28            return Err(anyhow!("no files matched glob: {}", self.cfg.glob));
29        }
30        paths.sort();
31
32        let mut out = Vec::with_capacity(paths.len());
33        for p in paths {
34            let text =
35                std::fs::read_to_string(&p).with_context(|| format!("reading {}", p.display()))?;
36            let doc_id = self.id_for(&p)?;
37            let title = p
38                .file_name()
39                .and_then(|s| s.to_str())
40                .map(|s| s.to_string());
41            out.push(Document {
42                id: doc_id,
43                content: text,
44                title,
45                metadata: json!({ "source_path": p.display().to_string() }),
46                fingerprint: None,
47            });
48        }
49        Ok(out)
50    }
51
52    fn id_for(&self, path: &Path) -> Result<String> {
53        match self.cfg.id_from.as_str() {
54            "path" => Ok(path.display().to_string()),
55            "stem" => path
56                .file_stem()
57                .and_then(|s| s.to_str())
58                .map(|s| s.to_string())
59                .ok_or_else(|| anyhow!("file has no stem: {}", path.display())),
60            "sha1" => {
61                let mut hasher = Sha1::new();
62                hasher.update(path.display().to_string().as_bytes());
63                Ok(format!("{:x}", hasher.finalize()))
64            }
65            other => Err(anyhow!("unknown id_from: {other}")),
66        }
67    }
68}