Skip to main content

rust_scraper/domain/
exporter.rs

1//! Exporter trait and configuration for RAG pipeline
2//!
3//! Defines the interface for exporting scraped content to various formats
4//! suitable for retrieval-augmented generation systems.
5
6use std::path::PathBuf;
7
8use serde::{Deserialize, Serialize};
9use thiserror::Error;
10
11use crate::domain::entities::{DocumentChunk, ExportFormat};
12
/// Errors that can occur during export operations.
///
/// The `#[error(...)]` strings are the `Display` messages of each variant and
/// are written in Spanish; they are runtime text, not documentation.
#[derive(Error, Debug)]
pub enum ExporterError {
    /// Failed to create the output directory.
    ///
    /// NOTE(review): `#[from]` generates `From<std::io::Error>`, so *any*
    /// `std::io::Error` propagated with `?` becomes this variant and is
    /// reported with the "could not create output directory" message, even
    /// when the failing operation was something else.
    #[error("No se pudo crear el directorio de salida: {0}")]
    DirectoryCreation(#[from] std::io::Error),

    /// Failed to open or write to a file; carries a free-form description.
    #[error("Error de escritura: {0}")]
    WriteError(String),

    /// The configuration is invalid; carries a free-form description.
    #[error("Configuración inválida: {0}")]
    InvalidConfig(String),

    /// Serialization failed; converts automatically from `serde_json::Error`.
    #[error("Error de serialización: {0}")]
    Serialization(#[from] serde_json::Error),

    /// A batch operation failed (possibly after partial success); carries a
    /// free-form description.
    #[error("Error en batch: {0}")]
    BatchError(String),

    /// A state store operation failed; converts automatically from the
    /// crate-level `ScraperError`.
    #[error("Error en state store: {0}")]
    StateStore(#[from] crate::error::ScraperError),
}
40
/// Result type for exporter operations.
///
/// Shorthand for a `Result` whose error type is always [`ExporterError`].
pub type ExportResult<T> = std::result::Result<T, ExporterError>;
43
/// Configuration for exporter instances.
///
/// Contains all settings needed to configure an exporter for a specific format
/// and output location. The type is serializable and deserializable via serde.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ExporterConfig {
    /// Output directory where files will be written.
    pub output_dir: PathBuf,
    /// Export format to use; its extension is appended to `filename` when the
    /// output path is built.
    pub format: ExportFormat,
    /// Base filename (without extension).
    pub filename: String,
    /// Whether to append to existing files (`true`) or overwrite (`false`).
    pub append: bool,
    /// Optional batch size for batch operations; `None` when not configured.
    // NOTE(review): nothing in this file validates the value (e.g. zero) —
    // confirm how exporters interpret it.
    pub batch_size: Option<usize>,
}
61
62impl ExporterConfig {
63    /// Create a new ExporterConfig with required fields
64    ///
65    /// # Errors
66    /// Returns InvalidConfig if output_dir is not a valid directory path
67    pub fn new(output_dir: PathBuf, format: ExportFormat, filename: impl Into<String>) -> Self {
68        Self {
69            output_dir,
70            format,
71            filename: filename.into(),
72            append: false,
73            batch_size: None,
74        }
75    }
76
77    /// Set append mode
78    #[must_use]
79    pub fn with_append(mut self, append: bool) -> Self {
80        self.append = append;
81        self
82    }
83
84    /// Set batch size
85    #[must_use]
86    pub fn with_batch_size(mut self, size: usize) -> Self {
87        self.batch_size = Some(size);
88        self
89    }
90
91    /// Get the full output file path
92    #[must_use]
93    pub fn output_path(&self) -> PathBuf {
94        let ext = self.format.extension();
95        self.output_dir.join(format!("{}.{}", self.filename, ext))
96    }
97
98    /// Get the state file path for this configuration
99    #[must_use]
100    pub fn state_path(&self) -> PathBuf {
101        let state_dir = self.output_dir.join("state");
102        // Extract domain from filename if possible, otherwise use filename
103        let domain = self.filename.clone();
104        state_dir.join(format!("{}.json", domain))
105    }
106}
107
108/// Default implementation for ExporterConfig
109impl Default for ExporterConfig {
110    fn default() -> Self {
111        Self {
112            output_dir: PathBuf::from("./output"),
113            format: ExportFormat::Jsonl,
114            filename: "export".to_string(),
115            append: false,
116            batch_size: None,
117        }
118    }
119}
120
/// Trait for exporting document chunks to various formats
///
/// Implementors must provide:
/// - Single-document export (`export` method)
/// - Batch export (`export_batch` method)
/// - Access to their configuration (`config` method)
///
/// `format` has a default implementation that reads the format from
/// `config()`.
///
/// The supertrait bounds require implementors to be:
/// - `Send + Sync`: Safe to move to and share across threads
/// - `'static`: No lifetime dependencies on caller
///
/// # Example
/// ```ignore
/// struct JsonlExporter {
///     config: ExporterConfig,
/// }
///
/// impl Exporter for JsonlExporter {
///     fn export(&self, document: DocumentChunk) -> ExportResult<()> { ... }
///     fn export_batch(&self, documents: Vec<DocumentChunk>) -> ExportResult<()> { ... }
///     fn config(&self) -> &ExporterConfig { &self.config }
/// }
/// ```
pub trait Exporter: Send + Sync + 'static {
    /// Export a single document chunk
    ///
    /// # Arguments
    /// * `document` - The document chunk to export (taken by value)
    ///
    /// # Errors
    /// Returns ExporterError if export fails
    fn export(&self, document: DocumentChunk) -> ExportResult<()>;

    /// Export multiple documents in batch
    ///
    /// This method is intended for bulk operations; implementations may:
    /// - Batch I/O operations for better performance
    /// - Use streaming writes for large datasets
    /// - Maintain transaction semantics
    ///
    /// None of these are enforced by the trait itself.
    ///
    /// # Arguments
    /// * `documents` - Collection of document chunks to export
    ///
    /// # Errors
    /// Returns ExporterError if any document fails to export
    fn export_batch(&self, documents: Vec<DocumentChunk>) -> ExportResult<()>;

    /// Get the configuration for this exporter
    fn config(&self) -> &ExporterConfig;

    /// Get the format this exporter produces
    ///
    /// Defaults to the `format` field of [`Exporter::config`].
    fn format(&self) -> ExportFormat {
        self.config().format
    }
}
174
175/// Extension trait for convenient exporter operations
176pub trait ExporterExt: Exporter {
177    /// Export a single document, converting from ScrapedContent
178    ///
179    /// Convenience method that handles the conversion from ScrapedContent
180    /// to DocumentChunk internally.
181    fn export_scraped(&self, scraped: &crate::domain::ScrapedContent) -> ExportResult<()> {
182        let chunk = DocumentChunk::from_scraped_content(scraped);
183        self.export(chunk)
184    }
185
186    /// Export multiple scraped contents in batch
187    fn export_scraped_batch(
188        &self,
189        scraped_contents: Vec<crate::domain::ScrapedContent>,
190    ) -> ExportResult<()> {
191        let chunks: Vec<DocumentChunk> = scraped_contents
192            .iter()
193            .map(DocumentChunk::from_scraped_content)
194            .collect();
195        self.export_batch(chunks)
196    }
197
198    /// Check if the exporter is configured to append
199    fn is_append_mode(&self) -> bool {
200        self.config().append
201    }
202
203    /// Get the output path
204    fn output_path(&self) -> PathBuf {
205        self.config().output_path()
206    }
207}
208
209impl<T: Exporter> ExporterExt for T {}
210
// Unit tests for the exporter configuration, the export format helpers it
// relies on, and the user-facing error messages.
#[cfg(test)]
mod tests {
    use std::path::PathBuf;

    use super::*;

    // Each format maps to the file extension used when building output paths.
    #[test]
    fn test_export_format_extension() {
        assert_eq!(ExportFormat::Markdown.extension(), "md");
        assert_eq!(ExportFormat::Jsonl.extension(), "jsonl");
        assert_eq!(ExportFormat::Zvec.extension(), "zvec");
    }

    // Each format exposes a human-readable display name.
    #[test]
    fn test_export_format_name() {
        assert_eq!(ExportFormat::Markdown.name(), "Markdown");
        assert_eq!(ExportFormat::Jsonl.name(), "JSONL");
        assert_eq!(ExportFormat::Zvec.name(), "Zvec");
    }

    // The default configuration targets JSONL, uses the base filename
    // "export", and overwrites rather than appends.
    #[test]
    fn test_exporter_config_default() {
        let config = ExporterConfig::default();
        assert_eq!(config.format, ExportFormat::Jsonl);
        assert_eq!(config.filename, "export");
        assert!(!config.append);
    }

    // The output path is `<output_dir>/<filename>.<format extension>`.
    #[test]
    fn test_exporter_config_output_path() {
        let config = ExporterConfig::new(
            PathBuf::from("/tmp/output"),
            ExportFormat::Jsonl,
            "test_export",
        );
        assert_eq!(
            config.output_path(),
            PathBuf::from("/tmp/output/test_export.jsonl")
        );
    }

    // Builder methods override the defaults set by `new` and leave the
    // required fields untouched.
    #[test]
    fn test_exporter_config_with_builder_pattern() {
        let config = ExporterConfig::new(PathBuf::from("/data"), ExportFormat::Zvec, "my_data")
            .with_append(true)
            .with_batch_size(1000);

        assert_eq!(config.output_dir, PathBuf::from("/data"));
        assert_eq!(config.format, ExportFormat::Zvec);
        assert_eq!(config.filename, "my_data");
        assert!(config.append);
        assert_eq!(config.batch_size, Some(1000));
    }

    // Error messages are emitted in Spanish; the checks lowercase the text so
    // they only depend on the key word of each message.
    #[test]
    fn test_exporter_error_messages() {
        let io_error = std::io::Error::new(std::io::ErrorKind::Other, "path error");
        let err = ExporterError::DirectoryCreation(io_error);
        assert!(err.to_string().to_lowercase().contains("directorio"));

        let err = ExporterError::WriteError("disk full".to_string());
        assert!(err.to_string().to_lowercase().contains("escritura"));

        let err = ExporterError::InvalidConfig("missing path".to_string());
        assert!(err.to_string().to_lowercase().contains("inválida"));
    }
}