chunkshop/sources/
files.rs1use std::path::{Path, PathBuf};
4
5use anyhow::{anyhow, Context, Result};
6use serde_json::json;
7use sha1::{Digest, Sha1};
8
9use crate::config::FilesSourceConfig;
10use crate::sources::base::Document;
11
12pub struct FilesSource {
13 cfg: FilesSourceConfig,
14}
15
16impl FilesSource {
17 pub fn new(cfg: FilesSourceConfig) -> Self {
18 Self { cfg }
19 }
20
21 pub fn iter_documents(&self) -> Result<Vec<Document>> {
23 let mut paths: Vec<PathBuf> = glob::glob(&self.cfg.glob)
24 .with_context(|| format!("invalid glob {:?}", self.cfg.glob))?
25 .filter_map(std::result::Result::ok)
26 .collect();
27 if paths.is_empty() {
28 return Err(anyhow!("no files matched glob: {}", self.cfg.glob));
29 }
30 paths.sort();
31
32 let mut out = Vec::with_capacity(paths.len());
33 for p in paths {
34 let text =
35 std::fs::read_to_string(&p).with_context(|| format!("reading {}", p.display()))?;
36 let doc_id = self.id_for(&p)?;
37 let title = p
38 .file_name()
39 .and_then(|s| s.to_str())
40 .map(|s| s.to_string());
41 out.push(Document {
42 id: doc_id,
43 content: text,
44 title,
45 metadata: json!({ "source_path": p.display().to_string() }),
46 fingerprint: None,
47 });
48 }
49 Ok(out)
50 }
51
52 fn id_for(&self, path: &Path) -> Result<String> {
53 match self.cfg.id_from.as_str() {
54 "path" => Ok(path.display().to_string()),
55 "stem" => path
56 .file_stem()
57 .and_then(|s| s.to_str())
58 .map(|s| s.to_string())
59 .ok_or_else(|| anyhow!("file has no stem: {}", path.display())),
60 "sha1" => {
61 let mut hasher = Sha1::new();
62 hasher.update(path.display().to_string().as_bytes());
63 Ok(format!("{:x}", hasher.finalize()))
64 }
65 other => Err(anyhow!("unknown id_from: {other}")),
66 }
67 }
68}