// rust_scraper/domain/exporter.rs
use std::path::PathBuf;

use serde::{Deserialize, Serialize};
use thiserror::Error;

use crate::domain::entities::{DocumentChunk, ExportFormat};

13#[derive(Error, Debug)]
15pub enum ExporterError {
16 #[error("No se pudo crear el directorio de salida: {0}")]
18 DirectoryCreation(#[from] std::io::Error),
19
20 #[error("Error de escritura: {0}")]
22 WriteError(String),
23
24 #[error("Configuración inválida: {0}")]
26 InvalidConfig(String),
27
28 #[error("Error de serialización: {0}")]
30 Serialization(#[from] serde_json::Error),
31
32 #[error("Error en batch: {0}")]
34 BatchError(String),
35
36 #[error("Error en state store: {0}")]
38 StateStore(#[from] crate::error::ScraperError),
39}
40
41pub type ExportResult<T> = std::result::Result<T, ExporterError>;
43
44#[derive(Debug, Clone, Serialize, Deserialize)]
49pub struct ExporterConfig {
50 pub output_dir: PathBuf,
52 pub format: ExportFormat,
54 pub filename: String,
56 pub append: bool,
58 pub batch_size: Option<usize>,
60}
61
62impl ExporterConfig {
63 pub fn new(output_dir: PathBuf, format: ExportFormat, filename: impl Into<String>) -> Self {
68 Self {
69 output_dir,
70 format,
71 filename: filename.into(),
72 append: false,
73 batch_size: None,
74 }
75 }
76
77 #[must_use]
79 pub fn with_append(mut self, append: bool) -> Self {
80 self.append = append;
81 self
82 }
83
84 #[must_use]
86 pub fn with_batch_size(mut self, size: usize) -> Self {
87 self.batch_size = Some(size);
88 self
89 }
90
91 #[must_use]
93 pub fn output_path(&self) -> PathBuf {
94 let ext = self.format.extension();
95 self.output_dir.join(format!("{}.{}", self.filename, ext))
96 }
97
98 #[must_use]
100 pub fn state_path(&self) -> PathBuf {
101 let state_dir = self.output_dir.join("state");
102 let domain = self.filename.clone();
104 state_dir.join(format!("{}.json", domain))
105 }
106}
107
108impl Default for ExporterConfig {
110 fn default() -> Self {
111 Self {
112 output_dir: PathBuf::from("./output"),
113 format: ExportFormat::Jsonl,
114 filename: "export".to_string(),
115 append: false,
116 batch_size: None,
117 }
118 }
119}
120
121pub trait Exporter: Send + Sync + 'static {
143 fn export(&self, document: DocumentChunk) -> ExportResult<()>;
151
152 fn export_batch(&self, documents: Vec<DocumentChunk>) -> ExportResult<()>;
165
166 fn config(&self) -> &ExporterConfig;
168
169 fn format(&self) -> ExportFormat {
171 self.config().format
172 }
173}
174
175pub trait ExporterExt: Exporter {
177 fn export_scraped(&self, scraped: &crate::domain::ScrapedContent) -> ExportResult<()> {
182 let chunk = DocumentChunk::from_scraped_content(scraped);
183 self.export(chunk)
184 }
185
186 fn export_scraped_batch(
188 &self,
189 scraped_contents: Vec<crate::domain::ScrapedContent>,
190 ) -> ExportResult<()> {
191 let chunks: Vec<DocumentChunk> = scraped_contents
192 .iter()
193 .map(DocumentChunk::from_scraped_content)
194 .collect();
195 self.export_batch(chunks)
196 }
197
198 fn is_append_mode(&self) -> bool {
200 self.config().append
201 }
202
203 fn output_path(&self) -> PathBuf {
205 self.config().output_path()
206 }
207}
208
209impl<T: Exporter> ExporterExt for T {}
210
#[cfg(test)]
mod tests {
    use std::path::PathBuf;

    use super::*;

    /// Each export format maps to its expected file extension.
    #[test]
    fn test_export_format_extension() {
        assert_eq!(ExportFormat::Markdown.extension(), "md");
        assert_eq!(ExportFormat::Jsonl.extension(), "jsonl");
        assert_eq!(ExportFormat::Zvec.extension(), "zvec");
    }

    /// Each export format exposes its expected display name.
    #[test]
    fn test_export_format_name() {
        assert_eq!(ExportFormat::Markdown.name(), "Markdown");
        assert_eq!(ExportFormat::Jsonl.name(), "JSONL");
        assert_eq!(ExportFormat::Zvec.name(), "Zvec");
    }

    /// `Default` yields JSONL output named `export` under `./output`,
    /// with append mode off and no batch size.
    #[test]
    fn test_exporter_config_default() {
        let config = ExporterConfig::default();
        assert_eq!(config.output_dir, PathBuf::from("./output"));
        assert_eq!(config.format, ExportFormat::Jsonl);
        assert_eq!(config.filename, "export");
        assert!(!config.append);
        assert_eq!(config.batch_size, None);
    }

    /// The output path is `<output_dir>/<filename>.<extension>`.
    #[test]
    fn test_exporter_config_output_path() {
        let config = ExporterConfig::new(
            PathBuf::from("/tmp/output"),
            ExportFormat::Jsonl,
            "test_export",
        );
        assert_eq!(
            config.output_path(),
            PathBuf::from("/tmp/output/test_export.jsonl")
        );
    }

    /// The state path is `<output_dir>/state/<filename>.json`.
    #[test]
    fn test_exporter_config_state_path() {
        let config = ExporterConfig::new(
            PathBuf::from("/tmp/output"),
            ExportFormat::Jsonl,
            "test_export",
        );
        assert_eq!(
            config.state_path(),
            PathBuf::from("/tmp/output/state/test_export.json")
        );
    }

    /// The `with_*` builders override the defaults set by `new`.
    #[test]
    fn test_exporter_config_with_builder_pattern() {
        let config = ExporterConfig::new(PathBuf::from("/data"), ExportFormat::Zvec, "my_data")
            .with_append(true)
            .with_batch_size(1000);

        assert_eq!(config.output_dir, PathBuf::from("/data"));
        assert_eq!(config.format, ExportFormat::Zvec);
        assert_eq!(config.filename, "my_data");
        assert!(config.append);
        assert_eq!(config.batch_size, Some(1000));
    }

    /// Error variants render their (Spanish) messages through `Display`.
    #[test]
    fn test_exporter_error_messages() {
        let io_error = std::io::Error::new(std::io::ErrorKind::Other, "path error");
        let err = ExporterError::DirectoryCreation(io_error);
        assert!(err.to_string().to_lowercase().contains("directorio"));

        let err = ExporterError::WriteError("disk full".to_string());
        assert!(err.to_string().to_lowercase().contains("escritura"));

        let err = ExporterError::InvalidConfig("missing path".to_string());
        assert!(err.to_string().to_lowercase().contains("inválida"));

        let err = ExporterError::BatchError("chunk 3".to_string());
        assert!(err.to_string().to_lowercase().contains("batch"));
    }
}
277}