use std::io::Read;
use std::path::Path;
use bookforge_core::{
marker::marker_ids_in_text,
segment::{BlockTranslation, Segment},
};
/// Summary returned by `validate_translated_epub`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct EpubValidationReport {
/// Starts as `true` and is cleared when a structural error is recorded
/// (the file or ZIP archive cannot be opened, a required entry is missing,
/// or a content document is malformed).
pub xml_valid: bool,
/// Number of archive entries whose names could be listed. This counts every
/// such entry, not only the documents whose markup was parsed.
pub files_checked: usize,
/// All findings, in the order they were discovered.
pub issues: Vec<EpubValidationIssue>,
}
/// A single finding produced while validating an EPUB or its block translations.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct EpubValidationIssue {
/// How serious the finding is.
pub severity: ValidationSeverity,
/// Short machine-readable tag for the finding, e.g. `"missing_marker"`,
/// `"malformed_xhtml"` or `"empty_translation"`.
pub kind: String,
/// Name of the archive entry that was being validated when the finding was
/// made; `None` for archive-level and translation-only findings.
pub href: Option<String>,
/// Identifier of the block the finding refers to; `None` when the finding
/// is not tied to a specific block.
pub block_id: Option<String>,
/// Human-readable description of the finding.
pub message: String,
}
/// Severity attached to an `EpubValidationIssue`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ValidationSeverity {
/// Informational finding.
/// NOTE(review): none of the validators visible in this file emit this level.
Info,
/// Suspicious but possibly acceptable, e.g. a protected span that may be
/// missing or an unusual source/translation length ratio.
Warning,
/// A definite problem, e.g. a missing required archive entry, malformed
/// markup, a missing marker or an empty translation.
Error,
}
pub fn validate_translated_epub(
epub_path: &Path,
segments: &[Segment],
block_translations: &[BlockTranslation],
) -> EpubValidationReport {
let mut report = EpubValidationReport {
xml_valid: true,
files_checked: 0,
issues: Vec::new(),
};
let Ok(file) = std::fs::File::open(epub_path) else {
report.issues.push(EpubValidationIssue {
severity: ValidationSeverity::Error,
kind: "epub_missing".to_string(),
href: None,
block_id: None,
message: format!("EPUB file not found: {}", epub_path.display()),
});
report.xml_valid = false;
return report;
};
match zip::ZipArchive::new(file) {
Ok(mut archive) => {
let mut file_names = Vec::new();
for i in 0..archive.len() {
if let Ok(entry) = archive.by_index(i) {
file_names.push(entry.name().to_string());
}
}
report.files_checked = file_names.len();
if !file_names.contains(&"mimetype".to_string()) {
report.issues.push(EpubValidationIssue {
severity: ValidationSeverity::Error,
kind: "missing_mimetype".to_string(),
href: None,
block_id: None,
message: "EPUB missing mimetype file".to_string(),
});
report.xml_valid = false;
}
if !file_names.contains(&"META-INF/container.xml".to_string()) {
report.issues.push(EpubValidationIssue {
severity: ValidationSeverity::Error,
kind: "missing_container".to_string(),
href: None,
block_id: None,
message: "EPUB missing META-INF/container.xml".to_string(),
});
report.xml_valid = false;
}
for name in &file_names {
if (name.ends_with(".xhtml") || name.ends_with(".html") || name.ends_with(".opf"))
&& let Ok(mut entry) = archive.by_name(name)
{
let mut content = String::new();
if entry.read_to_string(&mut content).is_ok() {
validate_xhtml_content(
&mut report,
name,
&content,
segments,
block_translations,
);
}
}
}
}
Err(e) => {
report.issues.push(EpubValidationIssue {
severity: ValidationSeverity::Error,
kind: "zip_open_failed".to_string(),
href: None,
block_id: None,
message: format!("Failed to open EPUB as ZIP: {e}"),
});
report.xml_valid = false;
}
}
report
}
fn validate_xhtml_content(
report: &mut EpubValidationReport,
href: &str,
content: &str,
segments: &[Segment],
block_translations: &[BlockTranslation],
) {
let by_block_id = block_translations
.iter()
.map(|bt| (bt.block_id.0.as_str(), bt.text.as_str()))
.collect::<std::collections::HashMap<_, _>>();
for segment in segments {
for block in &segment.source.blocks {
let translated = by_block_id
.get(block.block_id.0.as_str())
.copied()
.unwrap_or(&block.text);
let translated_markers = marker_ids_in_text(translated);
for marker in marker_ids_in_text(&block.text) {
if !translated_markers.contains(&marker) {
report.issues.push(EpubValidationIssue {
severity: ValidationSeverity::Error,
kind: "missing_marker".to_string(),
href: Some(href.to_string()),
block_id: Some(block.block_id.0.clone()),
message: format!(
"Required marker '{marker}' missing in translation of block {}",
block.block_id.0
),
});
}
}
for span in &block.protected_spans {
if block.text.contains(span) && !translated.contains(span) {
report.issues.push(EpubValidationIssue {
severity: ValidationSeverity::Warning,
kind: "missing_protected_span".to_string(),
href: Some(href.to_string()),
block_id: Some(block.block_id.0.clone()),
message: format!(
"Protected span '{}' may be missing in block {}",
span, block.block_id.0
),
});
}
}
}
}
if has_broken_xml(content) {
report.issues.push(EpubValidationIssue {
severity: ValidationSeverity::Error,
kind: "malformed_xhtml".to_string(),
href: Some(href.to_string()),
block_id: None,
message: format!("Malformed XHTML in {href}"),
});
report.xml_valid = false;
}
}
/// Returns `true` when `content` is not well-formed XML as far as this check
/// can tell.
///
/// A document is considered broken when the `quick_xml` reader reports an
/// error while scanning it, or when the end of input is reached while
/// elements are still open. The open-element count is tracked here so that a
/// truncated document is rejected even if the reader reaches end of input
/// without raising an error.
fn has_broken_xml(content: &str) -> bool {
    use quick_xml::{Reader, events::Event};

    let mut reader = Reader::from_str(content);
    reader.config_mut().trim_text(false);

    // Start tags seen so far that have not yet been closed. Self-closing
    // elements arrive as `Event::Empty` and never change the count.
    let mut open_elements: usize = 0;
    loop {
        match reader.read_event() {
            Ok(Event::Start(_)) => open_elements += 1,
            // Saturating: a stray end tag must not underflow the counter.
            Ok(Event::End(_)) => open_elements = open_elements.saturating_sub(1),
            Ok(Event::Eof) => return open_elements != 0,
            Ok(_) => {}
            Err(_) => return true,
        }
    }
}
/// Checks every source block against its translation and returns the findings.
///
/// A block without an entry in `block_translations` is compared with its own
/// source text. For each block:
///
/// * If the translation is empty or whitespace-only while the source has
///   visible text, an `empty_translation` error is returned. No length-ratio
///   warning is added for that block, since it would only repeat the same
///   problem.
/// * If source and translation are both blank, the block is skipped.
/// * Otherwise the character-count ratio translation/source is computed (the
///   source length is clamped to at least 1 to avoid dividing by zero); a
///   ratio outside `0.1..=5.0` yields a `suspicious_length_ratio` warning.
///
/// All returned issues have `href` set to `None`.
pub fn validate_block_translations(
    segments: &[Segment],
    block_translations: &[BlockTranslation],
) -> Vec<EpubValidationIssue> {
    let by_block_id: std::collections::HashMap<&str, &str> = block_translations
        .iter()
        .map(|bt| (bt.block_id.0.as_str(), bt.text.as_str()))
        .collect();

    let mut issues = Vec::new();
    for block in segments
        .iter()
        .flat_map(|segment| segment.source.blocks.iter())
    {
        let translated = by_block_id
            .get(block.block_id.0.as_str())
            .copied()
            .unwrap_or(&block.text);

        // Whitespace-only text carries no content, so treat it like empty text.
        if translated.trim().is_empty() {
            if !block.text.trim().is_empty() {
                issues.push(EpubValidationIssue {
                    severity: ValidationSeverity::Error,
                    kind: "empty_translation".to_string(),
                    href: None,
                    block_id: Some(block.block_id.0.clone()),
                    message: format!("Block {} has empty translation", block.block_id.0),
                });
            }
            continue;
        }

        let source_len = block.text.chars().count().max(1);
        let trans_len = translated.chars().count();
        let ratio = trans_len as f64 / source_len as f64;
        if !(0.1..=5.0).contains(&ratio) {
            issues.push(EpubValidationIssue {
                severity: ValidationSeverity::Warning,
                kind: "suspicious_length_ratio".to_string(),
                href: None,
                block_id: Some(block.block_id.0.clone()),
                message: format!(
                    "Suspicious length ratio {:.2} for block {} (source={source_len}, translation={trans_len})",
                    ratio, block.block_id.0
                ),
            });
        }
    }
    issues
}
#[cfg(test)]
mod tests {
    use super::*;

    // --- has_broken_xml -----------------------------------------------------

    #[test]
    fn rejects_broken_xml() {
        assert!(has_broken_xml("<p><b>text</p></b>"));
    }

    #[test]
    fn accepts_well_formed_xml() {
        assert!(!has_broken_xml("<p>Hello <b>world</b></p>"));
    }

    #[test]
    fn accepts_self_closing_tags_with_attributes() {
        assert!(!has_broken_xml(
            r#"<root><img src="a.png" alt="x"/><br/></root>"#,
        ));
    }

    #[test]
    fn accepts_namespaced_xhtml() {
        let xhtml = r#"<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops"><body><p epub:type="chapter">x</p></body></html>"#;
        assert!(!has_broken_xml(xhtml));
    }

    #[test]
    fn accepts_comments_and_processing_instructions() {
        let xml = "<?xml version=\"1.0\"?><!-- a comment --><root><a/></root>";
        assert!(!has_broken_xml(xml));
    }

    // --- validate_block_translations ---------------------------------------

    #[test]
    fn detects_empty_translation() {
        use bookforge_core::ir::SectionId;
        use bookforge_core::segment::{
            Segment, SegmentBlock, SegmentConstraints, SegmentContext, SegmentId, SegmentMetadata,
            SegmentSource, SegmentTextRun,
        };

        let block_id = bookforge_core::ir::BlockId("b_01".to_string());
        let segment = Segment {
            id: SegmentId("seg_01".to_string()),
            section_id: SectionId("sec_01".to_string()),
            ordinal: 0,
            block_ids: vec![block_id.clone()],
            source: SegmentSource {
                text: "Hello".to_string(),
                blocks: vec![SegmentBlock {
                    block_id: block_id.clone(),
                    kind: "p".to_string(),
                    text: "Hello".to_string(),
                    text_runs: vec![SegmentTextRun {
                        id: "r0".to_string(),
                        text: "Hello".to_string(),
                    }],
                    protected_spans: vec![],
                }],
                token_estimate: 1,
            },
            context: SegmentContext::default(),
            metadata: SegmentMetadata::default(),
            constraints: SegmentConstraints::default(),
            checksum: "abc".to_string(),
        };

        let issues = validate_block_translations(
            &[segment],
            &[BlockTranslation {
                block_id: block_id.clone(),
                text: String::new(),
            }],
        );

        // Check for the specific finding: a bare "not empty" assertion would
        // also be satisfied by an unrelated warning for the same block.
        assert!(
            issues.iter().any(|issue| {
                issue.kind == "empty_translation"
                    && issue.severity == ValidationSeverity::Error
                    && issue.block_id.as_deref() == Some("b_01")
            }),
            "expected an empty_translation error for block b_01, got {issues:?}"
        );
    }
}