use std::path::{Path, PathBuf};
use anyhow::{anyhow, Context, Result};
use serde_json::json;
use sha1::{Digest, Sha1};
use crate::config::FilesSourceConfig;
use crate::sources::base::Document;
/// Document source backed by files on the local filesystem.
///
/// The set of files and the way document ids are derived are both taken from
/// the wrapped [`FilesSourceConfig`].
pub struct FilesSource {
    /// Settings for this source; the methods on this type read its `glob`
    /// pattern and `id_from` strategy.
    cfg: FilesSourceConfig,
}
impl FilesSource {
pub fn new(cfg: FilesSourceConfig) -> Self {
Self { cfg }
}
pub fn iter_documents(&self) -> Result<Vec<Document>> {
let mut paths: Vec<PathBuf> = glob::glob(&self.cfg.glob)
.with_context(|| format!("invalid glob {:?}", self.cfg.glob))?
.filter_map(std::result::Result::ok)
.collect();
if paths.is_empty() {
return Err(anyhow!("no files matched glob: {}", self.cfg.glob));
}
paths.sort();
let mut out = Vec::with_capacity(paths.len());
for p in paths {
let text =
std::fs::read_to_string(&p).with_context(|| format!("reading {}", p.display()))?;
let doc_id = self.id_for(&p)?;
let title = p
.file_name()
.and_then(|s| s.to_str())
.map(|s| s.to_string());
out.push(Document {
id: doc_id,
content: text,
title,
metadata: json!({ "source_path": p.display().to_string() }),
fingerprint: None,
});
}
Ok(out)
}
fn id_for(&self, path: &Path) -> Result<String> {
match self.cfg.id_from.as_str() {
"path" => Ok(path.display().to_string()),
"stem" => path
.file_stem()
.and_then(|s| s.to_str())
.map(|s| s.to_string())
.ok_or_else(|| anyhow!("file has no stem: {}", path.display())),
"sha1" => {
let mut hasher = Sha1::new();
hasher.update(path.display().to_string().as_bytes());
Ok(format!("{:x}", hasher.finalize()))
}
other => Err(anyhow!("unknown id_from: {other}")),
}
}
}