Skip to main content

synaptic_loaders/
csv_loader.rs

1use std::collections::HashMap;
2
3use crate::Document;
4use async_trait::async_trait;
5use serde_json::Value;
6use synaptic_core::SynapticError;
7
8use crate::Loader;
9
/// Loads documents from in-memory CSV data. Each data row becomes one `Document`.
///
/// * `content_column` names the header whose value becomes the document
///   content. When unset, all fields of the row are joined with a single space.
/// * `id_column` names the header whose value becomes the document id. When
///   unset, ids are generated as `row-{i}` where `i` is the zero-based row index.
///
/// Every column of the row is additionally stored in the document metadata,
/// keyed by its header name.
#[derive(Debug, Clone)]
pub struct CsvLoader {
    /// Raw CSV text, including the header row.
    data: String,
    /// Header name of the column used as document content, if any.
    content_column: Option<String>,
    /// Header name of the column used as document id, if any.
    id_column: Option<String>,
}
20
21impl CsvLoader {
22    pub fn new(data: impl Into<String>) -> Self {
23        Self {
24            data: data.into(),
25            content_column: None,
26            id_column: None,
27        }
28    }
29
30    pub fn with_content_column(mut self, column: impl Into<String>) -> Self {
31        self.content_column = Some(column.into());
32        self
33    }
34
35    pub fn with_id_column(mut self, column: impl Into<String>) -> Self {
36        self.id_column = Some(column.into());
37        self
38    }
39}
40
41#[async_trait]
42impl Loader for CsvLoader {
43    async fn load(&self) -> Result<Vec<Document>, SynapticError> {
44        let mut reader = csv::Reader::from_reader(self.data.as_bytes());
45        let headers = reader
46            .headers()
47            .map_err(|e| SynapticError::Loader(format!("CSV header error: {e}")))?
48            .clone();
49
50        let mut docs = Vec::new();
51
52        for (i, result) in reader.records().enumerate() {
53            let record =
54                result.map_err(|e| SynapticError::Loader(format!("CSV row {i} error: {e}")))?;
55
56            let id = if let Some(id_col) = &self.id_column {
57                let idx = headers
58                    .iter()
59                    .position(|h| h == id_col.as_str())
60                    .ok_or_else(|| {
61                        SynapticError::Loader(format!("id column '{id_col}' not found"))
62                    })?;
63                record.get(idx).unwrap_or("").to_string()
64            } else {
65                format!("row-{i}")
66            };
67
68            let content = if let Some(content_col) = &self.content_column {
69                let idx = headers
70                    .iter()
71                    .position(|h| h == content_col.as_str())
72                    .ok_or_else(|| {
73                        SynapticError::Loader(format!("content column '{content_col}' not found"))
74                    })?;
75                record.get(idx).unwrap_or("").to_string()
76            } else {
77                // Concatenate all columns
78                record.iter().collect::<Vec<&str>>().join(" ")
79            };
80
81            // Store all columns as metadata
82            let mut metadata = HashMap::new();
83            for (j, header) in headers.iter().enumerate() {
84                if let Some(value) = record.get(j) {
85                    metadata.insert(header.to_string(), Value::String(value.to_string()));
86                }
87            }
88
89            docs.push(Document::with_metadata(id, content, metadata));
90        }
91
92        Ok(docs)
93    }
94}