use std::collections::HashMap;
use crate::Document;
use async_trait::async_trait;
use serde_json::Value;
use synaptic_core::SynapticError;
use crate::Loader;
/// Loader that turns in-memory CSV text into documents, one per record.
///
/// By default every field of a record is joined with a single space to form
/// the document content, and the document id is `row-{i}` where `i` is the
/// zero-based record index. Both can be overridden by naming a header column
/// via [`CsvLoader::with_content_column`] and [`CsvLoader::with_id_column`].
#[derive(Debug, Clone)]
pub struct CsvLoader {
    /// Raw CSV text to parse.
    data: String,
    /// Header name of the column whose value becomes the document content.
    /// `None` means all fields of the record are joined with spaces.
    content_column: Option<String>,
    /// Header name of the column whose value becomes the document id.
    /// `None` means ids are generated as `row-{i}`.
    id_column: Option<String>,
}

impl CsvLoader {
    /// Creates a loader over the given CSV text with no column overrides.
    pub fn new(data: impl Into<String>) -> Self {
        Self {
            data: data.into(),
            content_column: None,
            id_column: None,
        }
    }

    /// Selects the header column whose value is used as the document content.
    ///
    /// Consumes the loader and returns it so calls can be chained.
    pub fn with_content_column(mut self, column: impl Into<String>) -> Self {
        self.content_column = Some(column.into());
        self
    }

    /// Selects the header column whose value is used as the document id.
    ///
    /// Consumes the loader and returns it so calls can be chained.
    pub fn with_id_column(mut self, column: impl Into<String>) -> Self {
        self.id_column = Some(column.into());
        self
    }
}
#[async_trait]
impl Loader for CsvLoader {
async fn load(&self) -> Result<Vec<Document>, SynapticError> {
let mut reader = csv::Reader::from_reader(self.data.as_bytes());
let headers = reader
.headers()
.map_err(|e| SynapticError::Loader(format!("CSV header error: {e}")))?
.clone();
let mut docs = Vec::new();
for (i, result) in reader.records().enumerate() {
let record =
result.map_err(|e| SynapticError::Loader(format!("CSV row {i} error: {e}")))?;
let id = if let Some(id_col) = &self.id_column {
let idx = headers
.iter()
.position(|h| h == id_col.as_str())
.ok_or_else(|| {
SynapticError::Loader(format!("id column '{id_col}' not found"))
})?;
record.get(idx).unwrap_or("").to_string()
} else {
format!("row-{i}")
};
let content = if let Some(content_col) = &self.content_column {
let idx = headers
.iter()
.position(|h| h == content_col.as_str())
.ok_or_else(|| {
SynapticError::Loader(format!("content column '{content_col}' not found"))
})?;
record.get(idx).unwrap_or("").to_string()
} else {
record.iter().collect::<Vec<&str>>().join(" ")
};
let mut metadata = HashMap::new();
for (j, header) in headers.iter().enumerate() {
if let Some(value) = record.get(j) {
metadata.insert(header.to_string(), Value::String(value.to_string()));
}
}
docs.push(Document::with_metadata(id, content, metadata));
}
Ok(docs)
}
}