Skip to main content

bookforge_epub/
validate.rs

1use std::io::Read;
2use std::path::Path;
3
4use bookforge_core::{
5    marker::marker_ids_in_text,
6    segment::{BlockTranslation, Segment},
7};
8
9#[derive(Debug, Clone, PartialEq, Eq)]
10pub struct EpubValidationReport {
11    pub xml_valid: bool,
12    pub files_checked: usize,
13    pub issues: Vec<EpubValidationIssue>,
14}
15
16#[derive(Debug, Clone, PartialEq, Eq)]
17pub struct EpubValidationIssue {
18    pub severity: ValidationSeverity,
19    pub kind: String,
20    pub href: Option<String>,
21    pub block_id: Option<String>,
22    pub message: String,
23}
24
/// Severity of a validation issue.
///
/// Variants are declared from least to most severe, and the derived ordering
/// follows that declaration order (`Info < Warning < Error`). This lets callers
/// sort issues or filter with comparisons such as
/// `issue.severity >= ValidationSeverity::Warning`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum ValidationSeverity {
    /// Informational note; nothing needs fixing.
    Info,
    /// Possible problem that deserves a look.
    Warning,
    /// Definite problem.
    Error,
}
31
32pub fn validate_translated_epub(
33    epub_path: &Path,
34    segments: &[Segment],
35    block_translations: &[BlockTranslation],
36) -> EpubValidationReport {
37    let mut report = EpubValidationReport {
38        xml_valid: true,
39        files_checked: 0,
40        issues: Vec::new(),
41    };
42
43    let Ok(file) = std::fs::File::open(epub_path) else {
44        report.issues.push(EpubValidationIssue {
45            severity: ValidationSeverity::Error,
46            kind: "epub_missing".to_string(),
47            href: None,
48            block_id: None,
49            message: format!("EPUB file not found: {}", epub_path.display()),
50        });
51        report.xml_valid = false;
52        return report;
53    };
54
55    match zip::ZipArchive::new(file) {
56        Ok(mut archive) => {
57            let mut file_names = Vec::new();
58            for i in 0..archive.len() {
59                if let Ok(entry) = archive.by_index(i) {
60                    file_names.push(entry.name().to_string());
61                }
62            }
63            report.files_checked = file_names.len();
64
65            if !file_names.contains(&"mimetype".to_string()) {
66                report.issues.push(EpubValidationIssue {
67                    severity: ValidationSeverity::Error,
68                    kind: "missing_mimetype".to_string(),
69                    href: None,
70                    block_id: None,
71                    message: "EPUB missing mimetype file".to_string(),
72                });
73                report.xml_valid = false;
74            }
75
76            if !file_names.contains(&"META-INF/container.xml".to_string()) {
77                report.issues.push(EpubValidationIssue {
78                    severity: ValidationSeverity::Error,
79                    kind: "missing_container".to_string(),
80                    href: None,
81                    block_id: None,
82                    message: "EPUB missing META-INF/container.xml".to_string(),
83                });
84                report.xml_valid = false;
85            }
86
87            for name in &file_names {
88                if (name.ends_with(".xhtml") || name.ends_with(".html") || name.ends_with(".opf"))
89                    && let Ok(mut entry) = archive.by_name(name)
90                {
91                    let mut content = String::new();
92                    if entry.read_to_string(&mut content).is_ok() {
93                        validate_xhtml_content(
94                            &mut report,
95                            name,
96                            &content,
97                            segments,
98                            block_translations,
99                        );
100                    }
101                }
102            }
103        }
104        Err(e) => {
105            report.issues.push(EpubValidationIssue {
106                severity: ValidationSeverity::Error,
107                kind: "zip_open_failed".to_string(),
108                href: None,
109                block_id: None,
110                message: format!("Failed to open EPUB as ZIP: {e}"),
111            });
112            report.xml_valid = false;
113        }
114    }
115
116    report
117}
118
119fn validate_xhtml_content(
120    report: &mut EpubValidationReport,
121    href: &str,
122    content: &str,
123    segments: &[Segment],
124    block_translations: &[BlockTranslation],
125) {
126    let by_block_id = block_translations
127        .iter()
128        .map(|bt| (bt.block_id.0.as_str(), bt.text.as_str()))
129        .collect::<std::collections::HashMap<_, _>>();
130
131    for segment in segments {
132        for block in &segment.source.blocks {
133            let translated = by_block_id
134                .get(block.block_id.0.as_str())
135                .copied()
136                .unwrap_or(&block.text);
137
138            let translated_markers = marker_ids_in_text(translated);
139            for marker in marker_ids_in_text(&block.text) {
140                if !translated_markers.contains(&marker) {
141                    report.issues.push(EpubValidationIssue {
142                        severity: ValidationSeverity::Error,
143                        kind: "missing_marker".to_string(),
144                        href: Some(href.to_string()),
145                        block_id: Some(block.block_id.0.clone()),
146                        message: format!(
147                            "Required marker '{marker}' missing in translation of block {}",
148                            block.block_id.0
149                        ),
150                    });
151                }
152            }
153
154            for span in &block.protected_spans {
155                if block.text.contains(span) && !translated.contains(span) {
156                    report.issues.push(EpubValidationIssue {
157                        severity: ValidationSeverity::Warning,
158                        kind: "missing_protected_span".to_string(),
159                        href: Some(href.to_string()),
160                        block_id: Some(block.block_id.0.clone()),
161                        message: format!(
162                            "Protected span '{}' may be missing in block {}",
163                            span, block.block_id.0
164                        ),
165                    });
166                }
167            }
168        }
169    }
170
171    if has_broken_xml(content) {
172        report.issues.push(EpubValidationIssue {
173            severity: ValidationSeverity::Error,
174            kind: "malformed_xhtml".to_string(),
175            href: Some(href.to_string()),
176            block_id: None,
177            message: format!("Malformed XHTML in {href}"),
178        });
179        report.xml_valid = false;
180    }
181}
182
183fn has_broken_xml(content: &str) -> bool {
184    use quick_xml::{Reader, events::Event};
185    let mut reader = Reader::from_str(content);
186    reader.config_mut().trim_text(false);
187    loop {
188        match reader.read_event() {
189            Ok(Event::Eof) => return false,
190            Ok(_) => continue,
191            Err(_) => return true,
192        }
193    }
194}
195
196pub fn validate_block_translations(
197    segments: &[Segment],
198    block_translations: &[BlockTranslation],
199) -> Vec<EpubValidationIssue> {
200    let mut issues = Vec::new();
201    let by_block_id = block_translations
202        .iter()
203        .map(|bt| (bt.block_id.0.as_str(), bt.text.as_str()))
204        .collect::<std::collections::HashMap<_, _>>();
205
206    for segment in segments {
207        for block in &segment.source.blocks {
208            let translated = by_block_id
209                .get(block.block_id.0.as_str())
210                .copied()
211                .unwrap_or(&block.text);
212
213            if block.text.is_empty() && translated.is_empty() {
214                continue;
215            }
216
217            if translated.is_empty() && !block.text.is_empty() {
218                issues.push(EpubValidationIssue {
219                    severity: ValidationSeverity::Error,
220                    kind: "empty_translation".to_string(),
221                    href: None,
222                    block_id: Some(block.block_id.0.clone()),
223                    message: format!("Block {} has empty translation", block.block_id.0),
224                });
225            }
226
227            let source_len = block.text.chars().count().max(1);
228            let trans_len = translated.chars().count();
229            let ratio = trans_len as f64 / source_len as f64;
230            if !(0.1..=5.0).contains(&ratio) {
231                issues.push(EpubValidationIssue {
232                    severity: ValidationSeverity::Warning,
233                    kind: "suspicious_length_ratio".to_string(),
234                    href: None,
235                    block_id: Some(block.block_id.0.clone()),
236                    message: format!(
237                        "Suspicious length ratio {:.2} for block {} (source={source_len}, translation={trans_len})",
238                        ratio, block.block_id.0
239                    ),
240                });
241            }
242        }
243    }
244
245    issues
246}
247
#[cfg(test)]
mod tests {
    use super::*;

    // Mismatched nesting: `</p>` closes while `<b>` is still open.
    #[test]
    fn rejects_broken_xml() {
        assert!(has_broken_xml("<p><b>text</p></b>"));
    }

    #[test]
    fn accepts_well_formed_xml() {
        assert!(!has_broken_xml("<p>Hello <b>world</b></p>"));
    }

    #[test]
    fn accepts_self_closing_tags_with_attributes() {
        assert!(!has_broken_xml(
            r#"<root><img src="a.png" alt="x"/><br/></root>"#,
        ));
    }

    #[test]
    fn accepts_namespaced_xhtml() {
        let xhtml = r#"<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops"><body><p epub:type="chapter">x</p></body></html>"#;
        assert!(!has_broken_xml(xhtml));
    }

    #[test]
    fn accepts_comments_and_processing_instructions() {
        let xml = "<?xml version=\"1.0\"?><!-- a comment --><root><a/></root>";
        assert!(!has_broken_xml(xml));
    }

    #[test]
    fn detects_empty_translation() {
        use bookforge_core::ir::SectionId;
        use bookforge_core::segment::{
            Segment, SegmentBlock, SegmentConstraints, SegmentContext, SegmentId, SegmentMetadata,
            SegmentSource, SegmentTextRun,
        };

        let block_id = bookforge_core::ir::BlockId("b_01".to_string());
        let segment = Segment {
            id: SegmentId("seg_01".to_string()),
            section_id: SectionId("sec_01".to_string()),
            ordinal: 0,
            block_ids: vec![block_id.clone()],
            source: SegmentSource {
                text: "Hello".to_string(),
                blocks: vec![SegmentBlock {
                    block_id: block_id.clone(),
                    kind: "p".to_string(),
                    text: "Hello".to_string(),
                    text_runs: vec![SegmentTextRun {
                        id: "r0".to_string(),
                        text: "Hello".to_string(),
                    }],
                    protected_spans: vec![],
                }],
                token_estimate: 1,
            },
            context: SegmentContext::default(),
            metadata: SegmentMetadata::default(),
            constraints: SegmentConstraints::default(),
            checksum: "abc".to_string(),
        };
        let issues = validate_block_translations(
            &[segment],
            &[BlockTranslation {
                block_id: block_id.clone(),
                text: String::new(),
            }],
        );

        // Assert the specific finding: a bare "some issue exists" check would
        // also pass if only a length-ratio warning were produced.
        assert!(
            issues.iter().any(|issue| {
                issue.kind == "empty_translation"
                    && issue.severity == ValidationSeverity::Error
                    && issue.block_id.as_deref() == Some("b_01")
            }),
            "expected an empty_translation error for block b_01, got {issues:?}"
        );
    }
}