1use std::io::Read;
2use std::path::Path;
3
4use bookforge_core::{
5 marker::marker_ids_in_text,
6 segment::{BlockTranslation, Segment},
7};
8
9#[derive(Debug, Clone, PartialEq, Eq)]
10pub struct EpubValidationReport {
11 pub xml_valid: bool,
12 pub files_checked: usize,
13 pub issues: Vec<EpubValidationIssue>,
14}
15
16#[derive(Debug, Clone, PartialEq, Eq)]
17pub struct EpubValidationIssue {
18 pub severity: ValidationSeverity,
19 pub kind: String,
20 pub href: Option<String>,
21 pub block_id: Option<String>,
22 pub message: String,
23}
24
/// Severity attached to an [`EpubValidationIssue`].
///
/// Variants are declared from least to most severe, so the derived ordering
/// satisfies `Info < Warning < Error`. This lets callers pick the worst
/// finding with `max()` or filter with comparisons such as
/// `severity >= ValidationSeverity::Warning`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum ValidationSeverity {
    /// Informational note; nothing needs fixing.
    Info,
    /// Something looks suspicious and deserves a manual check.
    Warning,
    /// A definite problem with the EPUB or its translation.
    Error,
}
31
32pub fn validate_translated_epub(
33 epub_path: &Path,
34 segments: &[Segment],
35 block_translations: &[BlockTranslation],
36) -> EpubValidationReport {
37 let mut report = EpubValidationReport {
38 xml_valid: true,
39 files_checked: 0,
40 issues: Vec::new(),
41 };
42
43 let Ok(file) = std::fs::File::open(epub_path) else {
44 report.issues.push(EpubValidationIssue {
45 severity: ValidationSeverity::Error,
46 kind: "epub_missing".to_string(),
47 href: None,
48 block_id: None,
49 message: format!("EPUB file not found: {}", epub_path.display()),
50 });
51 report.xml_valid = false;
52 return report;
53 };
54
55 match zip::ZipArchive::new(file) {
56 Ok(mut archive) => {
57 let mut file_names = Vec::new();
58 for i in 0..archive.len() {
59 if let Ok(entry) = archive.by_index(i) {
60 file_names.push(entry.name().to_string());
61 }
62 }
63 report.files_checked = file_names.len();
64
65 if !file_names.contains(&"mimetype".to_string()) {
66 report.issues.push(EpubValidationIssue {
67 severity: ValidationSeverity::Error,
68 kind: "missing_mimetype".to_string(),
69 href: None,
70 block_id: None,
71 message: "EPUB missing mimetype file".to_string(),
72 });
73 report.xml_valid = false;
74 }
75
76 if !file_names.contains(&"META-INF/container.xml".to_string()) {
77 report.issues.push(EpubValidationIssue {
78 severity: ValidationSeverity::Error,
79 kind: "missing_container".to_string(),
80 href: None,
81 block_id: None,
82 message: "EPUB missing META-INF/container.xml".to_string(),
83 });
84 report.xml_valid = false;
85 }
86
87 for name in &file_names {
88 if (name.ends_with(".xhtml") || name.ends_with(".html") || name.ends_with(".opf"))
89 && let Ok(mut entry) = archive.by_name(name)
90 {
91 let mut content = String::new();
92 if entry.read_to_string(&mut content).is_ok() {
93 validate_xhtml_content(
94 &mut report,
95 name,
96 &content,
97 segments,
98 block_translations,
99 );
100 }
101 }
102 }
103 }
104 Err(e) => {
105 report.issues.push(EpubValidationIssue {
106 severity: ValidationSeverity::Error,
107 kind: "zip_open_failed".to_string(),
108 href: None,
109 block_id: None,
110 message: format!("Failed to open EPUB as ZIP: {e}"),
111 });
112 report.xml_valid = false;
113 }
114 }
115
116 report
117}
118
119fn validate_xhtml_content(
120 report: &mut EpubValidationReport,
121 href: &str,
122 content: &str,
123 segments: &[Segment],
124 block_translations: &[BlockTranslation],
125) {
126 let by_block_id = block_translations
127 .iter()
128 .map(|bt| (bt.block_id.0.as_str(), bt.text.as_str()))
129 .collect::<std::collections::HashMap<_, _>>();
130
131 for segment in segments {
132 for block in &segment.source.blocks {
133 let translated = by_block_id
134 .get(block.block_id.0.as_str())
135 .copied()
136 .unwrap_or(&block.text);
137
138 let translated_markers = marker_ids_in_text(translated);
139 for marker in marker_ids_in_text(&block.text) {
140 if !translated_markers.contains(&marker) {
141 report.issues.push(EpubValidationIssue {
142 severity: ValidationSeverity::Error,
143 kind: "missing_marker".to_string(),
144 href: Some(href.to_string()),
145 block_id: Some(block.block_id.0.clone()),
146 message: format!(
147 "Required marker '{marker}' missing in translation of block {}",
148 block.block_id.0
149 ),
150 });
151 }
152 }
153
154 for span in &block.protected_spans {
155 if block.text.contains(span) && !translated.contains(span) {
156 report.issues.push(EpubValidationIssue {
157 severity: ValidationSeverity::Warning,
158 kind: "missing_protected_span".to_string(),
159 href: Some(href.to_string()),
160 block_id: Some(block.block_id.0.clone()),
161 message: format!(
162 "Protected span '{}' may be missing in block {}",
163 span, block.block_id.0
164 ),
165 });
166 }
167 }
168 }
169 }
170
171 if has_broken_xml(content) {
172 report.issues.push(EpubValidationIssue {
173 severity: ValidationSeverity::Error,
174 kind: "malformed_xhtml".to_string(),
175 href: Some(href.to_string()),
176 block_id: None,
177 message: format!("Malformed XHTML in {href}"),
178 });
179 report.xml_valid = false;
180 }
181}
182
183fn has_broken_xml(content: &str) -> bool {
184 use quick_xml::{Reader, events::Event};
185 let mut reader = Reader::from_str(content);
186 reader.config_mut().trim_text(false);
187 loop {
188 match reader.read_event() {
189 Ok(Event::Eof) => return false,
190 Ok(_) => continue,
191 Err(_) => return true,
192 }
193 }
194}
195
196pub fn validate_block_translations(
197 segments: &[Segment],
198 block_translations: &[BlockTranslation],
199) -> Vec<EpubValidationIssue> {
200 let mut issues = Vec::new();
201 let by_block_id = block_translations
202 .iter()
203 .map(|bt| (bt.block_id.0.as_str(), bt.text.as_str()))
204 .collect::<std::collections::HashMap<_, _>>();
205
206 for segment in segments {
207 for block in &segment.source.blocks {
208 let translated = by_block_id
209 .get(block.block_id.0.as_str())
210 .copied()
211 .unwrap_or(&block.text);
212
213 if block.text.is_empty() && translated.is_empty() {
214 continue;
215 }
216
217 if translated.is_empty() && !block.text.is_empty() {
218 issues.push(EpubValidationIssue {
219 severity: ValidationSeverity::Error,
220 kind: "empty_translation".to_string(),
221 href: None,
222 block_id: Some(block.block_id.0.clone()),
223 message: format!("Block {} has empty translation", block.block_id.0),
224 });
225 }
226
227 let source_len = block.text.chars().count().max(1);
228 let trans_len = translated.chars().count();
229 let ratio = trans_len as f64 / source_len as f64;
230 if !(0.1..=5.0).contains(&ratio) {
231 issues.push(EpubValidationIssue {
232 severity: ValidationSeverity::Warning,
233 kind: "suspicious_length_ratio".to_string(),
234 href: None,
235 block_id: Some(block.block_id.0.clone()),
236 message: format!(
237 "Suspicious length ratio {:.2} for block {} (source={source_len}, translation={trans_len})",
238 ratio, block.block_id.0
239 ),
240 });
241 }
242 }
243 }
244
245 issues
246}
247
#[cfg(test)]
mod tests {
    use super::*;
    use bookforge_core::ir::{BlockId, SectionId};
    use bookforge_core::segment::{
        SegmentBlock, SegmentConstraints, SegmentContext, SegmentId, SegmentMetadata,
        SegmentSource, SegmentTextRun,
    };

    /// Builds a segment that holds exactly one paragraph block with `text`
    /// as its source text.
    fn single_block_segment(block_id: &BlockId, text: &str) -> Segment {
        Segment {
            id: SegmentId("seg_01".to_string()),
            section_id: SectionId("sec_01".to_string()),
            ordinal: 0,
            block_ids: vec![block_id.clone()],
            source: SegmentSource {
                text: text.to_string(),
                blocks: vec![SegmentBlock {
                    block_id: block_id.clone(),
                    kind: "p".to_string(),
                    text: text.to_string(),
                    text_runs: vec![SegmentTextRun {
                        id: "r0".to_string(),
                        text: text.to_string(),
                    }],
                    protected_spans: vec![],
                }],
                token_estimate: 1,
            },
            context: SegmentContext::default(),
            metadata: SegmentMetadata::default(),
            constraints: SegmentConstraints::default(),
            checksum: "abc".to_string(),
        }
    }

    /// Validates a single block with source text `source` translated as
    /// `translation`.
    fn issues_for(source: &str, translation: &str) -> Vec<EpubValidationIssue> {
        let block_id = BlockId("b_01".to_string());
        let segment = single_block_segment(&block_id, source);
        validate_block_translations(
            &[segment],
            &[BlockTranslation {
                block_id,
                text: translation.to_string(),
            }],
        )
    }

    #[test]
    fn rejects_broken_xml() {
        assert!(has_broken_xml("<p><b>text</p></b>"));
    }

    #[test]
    fn accepts_well_formed_xml() {
        assert!(!has_broken_xml("<p>Hello <b>world</b></p>"));
    }

    #[test]
    fn accepts_self_closing_tags_with_attributes() {
        assert!(!has_broken_xml(
            r#"<root><img src="a.png" alt="x"/><br/></root>"#,
        ));
    }

    #[test]
    fn accepts_namespaced_xhtml() {
        let xhtml = r#"<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops"><body><p epub:type="chapter">x</p></body></html>"#;
        assert!(!has_broken_xml(xhtml));
    }

    #[test]
    fn accepts_comments_and_processing_instructions() {
        let xml = "<?xml version=\"1.0\"?><!-- a comment --><root><a/></root>";
        assert!(!has_broken_xml(xml));
    }

    #[test]
    fn detects_empty_translation() {
        let issues = issues_for("Hello", "");
        // Pin the actual finding instead of merely "some issue exists".
        assert!(issues.iter().any(|issue| {
            issue.kind == "empty_translation"
                && issue.severity == ValidationSeverity::Error
                && issue.block_id.as_deref() == Some("b_01")
        }));
    }

    #[test]
    fn accepts_plausible_translation() {
        assert!(issues_for("Hello", "Bonjour").is_empty());
    }

    #[test]
    fn flags_suspicious_length_ratio() {
        // 30 characters against 5 gives a ratio of 6.0, above the 5.0 limit.
        let issues = issues_for("Hello", &"x".repeat(30));
        assert!(issues.iter().any(|issue| {
            issue.kind == "suspicious_length_ratio"
                && issue.severity == ValidationSeverity::Warning
        }));
    }

    #[test]
    fn reports_missing_epub_file() {
        let report = validate_translated_epub(
            std::path::Path::new("/nonexistent/bookforge-missing-file.epub"),
            &[],
            &[],
        );
        assert!(!report.xml_valid);
        assert_eq!(report.files_checked, 0);
        assert_eq!(report.issues.len(), 1);
        assert_eq!(report.issues[0].kind, "epub_missing");
        assert_eq!(report.issues[0].severity, ValidationSeverity::Error);
    }
}