Skip to main content

web_capture/
batch.rs

1//! Batch processing and configuration module (R7).
2//!
3//! Supports processing multiple URLs from a configuration file.
4//! Configuration format matches the `articles-config` pattern from meta-theory.
5//!
6//! Based on reference implementation from:
7//! <https://github.com/link-foundation/meta-theory/blob/main/scripts/articles-config.mjs>
8
9use serde::{Deserialize, Serialize};
10use std::collections::BTreeMap;
11use std::path::Path;
12use url::Url;
13
14/// Configuration for a single article.
15#[derive(Debug, Clone, Default, Serialize, Deserialize)]
16#[serde(rename_all = "camelCase")]
17pub struct ArticleConfig {
18    pub url: String,
19    #[serde(skip_serializing_if = "Option::is_none")]
20    pub title: Option<String>,
21    #[serde(skip_serializing_if = "Option::is_none")]
22    pub language: Option<String>,
23    #[serde(skip_serializing_if = "Option::is_none")]
24    pub archive_path: Option<String>,
25    #[serde(skip_serializing_if = "Option::is_none")]
26    pub markdown_file: Option<String>,
27    #[serde(skip_serializing_if = "Option::is_none")]
28    pub screenshot_light_file: Option<String>,
29    #[serde(skip_serializing_if = "Option::is_none")]
30    pub screenshot_dark_file: Option<String>,
31    #[serde(skip_serializing_if = "Option::is_none")]
32    pub images_dir: Option<String>,
33    #[serde(skip_serializing_if = "Option::is_none")]
34    pub has_local_images: Option<bool>,
35    #[serde(skip_serializing_if = "Option::is_none")]
36    pub expected_figures: Option<u32>,
37    #[serde(skip_serializing_if = "Option::is_none")]
38    pub format: Option<String>,
39}
40
41/// Batch configuration containing multiple articles and defaults.
42#[derive(Debug, Clone, Default, Serialize, Deserialize)]
43pub struct BatchConfig {
44    pub articles: BTreeMap<String, ArticleConfig>,
45    #[serde(skip_serializing_if = "Option::is_none")]
46    pub defaults: Option<ArticleConfig>,
47}
48
49/// Validation result for batch configuration.
50#[derive(Debug, Clone, Serialize, Deserialize)]
51pub struct ValidationResult {
52    pub valid: bool,
53    pub errors: Vec<String>,
54}
55
56/// Load batch configuration from a JSON file.
57///
58/// # Errors
59///
60/// Returns an error if the file cannot be read or parsed.
61pub fn load_config(config_path: &str) -> Result<BatchConfig, String> {
62    let path = Path::new(config_path);
63    let ext = path
64        .extension()
65        .and_then(|e| e.to_str())
66        .unwrap_or("")
67        .to_lowercase();
68
69    match ext.as_str() {
70        "json" => {
71            let content =
72                std::fs::read_to_string(config_path).map_err(|e| format!("Read error: {e}"))?;
73            serde_json::from_str(&content).map_err(|e| format!("Parse error: {e}"))
74        }
75        _ => Err(format!(
76            "Unsupported config format: .{ext}. Use .json (Rust does not support dynamic ESM imports)"
77        )),
78    }
79}
80
81/// Get article configuration by version/id.
82///
83/// Merges defaults with the specific article configuration.
84///
85/// # Errors
86///
87/// Returns an error if the version is not found.
88pub fn get_article(config: &BatchConfig, version: &str) -> Result<ArticleConfig, String> {
89    let article = config.articles.get(version).ok_or_else(|| {
90        let available: Vec<&String> = config.articles.keys().collect();
91        format!(
92            "Unknown article version: {version}. Available: {}",
93            available
94                .iter()
95                .map(|s| s.as_str())
96                .collect::<Vec<_>>()
97                .join(", ")
98        )
99    })?;
100
101    // Merge defaults
102    Ok(config.defaults.as_ref().map_or_else(
103        || article.clone(),
104        |defaults| merge_config(defaults, article),
105    ))
106}
107
108/// Get all article versions from configuration.
109#[must_use]
110pub fn get_all_versions(config: &BatchConfig) -> Vec<String> {
111    config.articles.keys().cloned().collect()
112}
113
114/// Get all article configurations with defaults merged.
115#[must_use]
116pub fn get_all_articles(config: &BatchConfig) -> Vec<ArticleConfig> {
117    config
118        .articles
119        .values()
120        .map(|article| {
121            config.defaults.as_ref().map_or_else(
122                || article.clone(),
123                |defaults| merge_config(defaults, article),
124            )
125        })
126        .collect()
127}
128
129/// Create a default batch configuration for a list of URLs.
130#[must_use]
131pub fn create_config_from_urls(urls: &[String], defaults: Option<ArticleConfig>) -> BatchConfig {
132    let mut articles = BTreeMap::new();
133
134    for (index, url) in urls.iter().enumerate() {
135        let id = (index + 1).to_string();
136        let hostname = Url::parse(url)
137            .ok()
138            .and_then(|u| u.host_str().map(String::from))
139            .unwrap_or_else(|| "article".to_string());
140
141        articles.insert(
142            id.clone(),
143            ArticleConfig {
144                url: url.clone(),
145                title: Some(format!("Article {id}")),
146                archive_path: Some(format!("archive/{}/{id}", hostname.replace('.', "-"))),
147                markdown_file: Some("article.md".to_string()),
148                screenshot_light_file: Some("article-light.png".to_string()),
149                screenshot_dark_file: Some("article-dark.png".to_string()),
150                images_dir: Some("images".to_string()),
151                has_local_images: Some(true),
152                ..Default::default()
153            },
154        );
155    }
156
157    BatchConfig { articles, defaults }
158}
159
160/// Validate a batch configuration.
161#[must_use]
162pub fn validate_config(config: &BatchConfig) -> ValidationResult {
163    let mut errors = Vec::new();
164
165    if config.articles.is_empty() {
166        errors.push("Configuration must have at least one article".to_string());
167        return ValidationResult {
168            valid: false,
169            errors,
170        };
171    }
172
173    for (id, article) in &config.articles {
174        if article.url.is_empty() {
175            errors.push(format!("Article \"{id}\" missing required \"url\" field"));
176        } else if Url::parse(&article.url).is_err() {
177            errors.push(format!("Article \"{id}\" has invalid URL: {}", article.url));
178        }
179    }
180
181    ValidationResult {
182        valid: errors.is_empty(),
183        errors,
184    }
185}
186
187/// Merge defaults into an article config (article values take precedence).
188fn merge_config(defaults: &ArticleConfig, article: &ArticleConfig) -> ArticleConfig {
189    ArticleConfig {
190        url: if article.url.is_empty() {
191            defaults.url.clone()
192        } else {
193            article.url.clone()
194        },
195        title: article.title.clone().or_else(|| defaults.title.clone()),
196        language: article
197            .language
198            .clone()
199            .or_else(|| defaults.language.clone()),
200        archive_path: article
201            .archive_path
202            .clone()
203            .or_else(|| defaults.archive_path.clone()),
204        markdown_file: article
205            .markdown_file
206            .clone()
207            .or_else(|| defaults.markdown_file.clone()),
208        screenshot_light_file: article
209            .screenshot_light_file
210            .clone()
211            .or_else(|| defaults.screenshot_light_file.clone()),
212        screenshot_dark_file: article
213            .screenshot_dark_file
214            .clone()
215            .or_else(|| defaults.screenshot_dark_file.clone()),
216        images_dir: article
217            .images_dir
218            .clone()
219            .or_else(|| defaults.images_dir.clone()),
220        has_local_images: article.has_local_images.or(defaults.has_local_images),
221        expected_figures: article.expected_figures.or(defaults.expected_figures),
222        format: article.format.clone().or_else(|| defaults.format.clone()),
223    }
224}