use std::collections::HashMap;
use std::fs;
use std::path::{Path, PathBuf};
use super::helpers::extract_attr;
use super::Check;
use crate::extracted::ExtractedEpub;
use crate::validate::ValidationReport;
pub struct TocExtrasChecks;
impl Check for TocExtrasChecks {
fn ids(&self) -> &'static [&'static str] {
&[
"R5.4", "R5.5", "R5.6", "R5.7", "R5.8", "R5.9", "R5.10", "R5.11",
]
}
fn run(&self, epub: &ExtractedEpub, report: &mut ValidationReport) {
let opf = &epub.opf;
let href_to_media: HashMap<String, String> = opf
.manifest
.values()
.map(|(href, mt)| (href.clone(), mt.clone()))
.collect();
let nav_href = find_nav_doc_href(epub);
let ncx_href: Option<String> = opf
.manifest
.values()
.find(|(_, mt)| mt == "application/x-dtbncx+xml")
.map(|(href, _)| href.clone());
let opf_text = fs::read_to_string(&epub.opf_path).unwrap_or_default();
if let Some(ref nav) = nav_href {
let nav_path = opf.base_dir.join(nav);
if spine_has_pagebreak(epub) {
if let Ok(nav_text) = fs::read_to_string(&nav_path) {
if !nav_has_page_list(&nav_text) {
report.emit_at(
"R5.4",
"",
Some(nav_path.clone()),
None,
);
}
}
}
}
if let Some(ref nav) = nav_href {
let nav_path = opf.base_dir.join(nav);
if let Ok(nav_text) = fs::read_to_string(&nav_path) {
for remote in find_remote_links(&nav_text) {
report.emit_at(
"R5.5",
format!("Nav document references '{}'.", remote),
Some(nav_path.clone()),
None,
);
}
}
}
if let Some(ref ncx) = ncx_href {
let ncx_path = opf.base_dir.join(ncx);
if let Ok(ncx_text) = fs::read_to_string(&ncx_path) {
for remote in find_remote_links(&ncx_text) {
report.emit_at(
"R5.5",
format!("NCX references '{}'.", remote),
Some(ncx_path.clone()),
None,
);
}
}
}
if let Some(ref nav) = nav_href {
let nav_path = opf.base_dir.join(nav);
if let Ok(nav_text) = fs::read_to_string(&nav_path) {
if let Some(msg) = nav_toc_out_of_spine_order(&nav_text, &opf.spine_items) {
report.emit_at("R5.6", msg, Some(nav_path.clone()), None);
}
}
}
if let Some(ref ncx) = ncx_href {
let ncx_path = opf.base_dir.join(ncx);
if let Ok(ncx_text) = fs::read_to_string(&ncx_path) {
if let Some(uid_raw) = extract_ncx_dtb_uid(&ncx_text) {
let uid_trimmed = uid_raw.trim();
if uid_trimmed.len() != uid_raw.len() {
report.emit_at(
"R5.8",
format!("dtb:uid is '{}'.", uid_raw),
Some(ncx_path.clone()),
None,
);
}
if let Some(opf_id) = opf_unique_identifier_value(&opf_text) {
if uid_trimmed != opf_id.trim() {
report.emit_at(
"R5.7",
format!(
"NCX dtb:uid '{}' does not match OPF dc:identifier '{}'.",
uid_trimmed, opf_id
),
Some(ncx_path.clone()),
None,
);
}
}
}
for empty_line in find_empty_navpoint_text(&ncx_text) {
report.emit_at(
"R5.9",
"",
Some(ncx_path.clone()),
Some(empty_line),
);
}
}
}
for (href, _title) in parse_guide_references(&opf_text) {
let file_part = strip_fragment(&href);
if file_part.is_empty() {
continue;
}
match href_to_media.get(&file_part) {
None => {
}
Some(mt) => {
if mt != "application/xhtml+xml"
&& mt != "application/x-dtbncx+xml"
{
report.emit_at(
"R5.10",
format!(
"Guide reference '{}' has media-type '{}', not application/xhtml+xml.",
href, mt
),
Some(epub.opf_path.clone()),
None,
);
}
}
}
}
if let Some(toc_id) = extract_spine_toc_idref(&opf_text) {
match opf.manifest.get(&toc_id) {
Some((_href, media_type)) => {
if media_type != "application/x-dtbncx+xml" {
report.emit_at(
"R5.11",
format!(
"spine toc='{}' targets manifest item with media-type '{}'.",
toc_id, media_type
),
Some(epub.opf_path.clone()),
None,
);
}
}
None => {
report.emit_at(
"R5.11",
format!("spine toc='{}' does not match any manifest item.", toc_id),
Some(epub.opf_path.clone()),
None,
);
}
}
}
}
}
fn find_nav_doc_href(epub: &ExtractedEpub) -> Option<String> {
let opf_text = fs::read_to_string(&epub.opf_path).ok()?;
let mut rest = opf_text.as_str();
while let Some(idx) = rest.find("<item") {
rest = &rest[idx + "<item".len()..];
let Some(end) = rest.find('>') else { break };
let tag = &rest[..end];
let properties = extract_attr(tag, "properties").unwrap_or_default();
if properties.split_whitespace().any(|p| p == "nav") {
if let Some(href) = extract_attr(tag, "href") {
return Some(href);
}
}
rest = &rest[end..];
}
None
}
fn spine_has_pagebreak(epub: &ExtractedEpub) -> bool {
let opf = &epub.opf;
for (_, href) in &opf.spine_items {
let full = opf.base_dir.join(href);
if let Ok(text) = fs::read_to_string(&full) {
if contains_pagebreak_epub_type(&text) {
return true;
}
}
}
false
}
fn contains_pagebreak_epub_type(text: &str) -> bool {
for needle in &["epub:type=\"", "epub:type='"] {
let mut rest = text;
while let Some(idx) = rest.find(needle) {
rest = &rest[idx + needle.len()..];
let quote = needle.chars().last().unwrap();
if let Some(end) = rest.find(quote) {
let value = &rest[..end];
if value.split_whitespace().any(|t| t == "pagebreak") {
return true;
}
rest = &rest[end..];
} else {
break;
}
}
}
false
}
fn nav_has_page_list(nav_text: &str) -> bool {
let mut rest = nav_text;
while let Some(idx) = rest.find("<nav") {
rest = &rest[idx + "<nav".len()..];
let Some(end) = rest.find('>') else { break };
let tag = &rest[..end];
let value = extract_attr(tag, "epub:type").unwrap_or_default();
if value.split_whitespace().any(|t| t == "page-list") {
return true;
}
rest = &rest[end..];
}
false
}
fn find_remote_links(text: &str) -> Vec<String> {
let mut out = Vec::new();
for attr in &["href", "src"] {
for quote in &["\"", "'"] {
let needle = format!("{}={}", attr, quote);
let mut rest = text;
while let Some(idx) = rest.find(&needle) {
rest = &rest[idx + needle.len()..];
if let Some(end) = rest.find(*quote) {
let url = &rest[..end];
if url.starts_with("http://") || url.starts_with("https://") {
out.push(url.to_string());
}
rest = &rest[end..];
} else {
break;
}
}
}
}
out
}
fn extract_nav_toc_hrefs(nav_text: &str) -> Vec<String> {
let mut out = Vec::new();
let mut rest = nav_text;
let mut toc_body_start: Option<usize> = None;
let mut offset = 0usize;
while let Some(idx) = rest.find("<nav") {
let abs_open_start = offset + idx;
rest = &rest[idx + "<nav".len()..];
offset = abs_open_start + "<nav".len();
let Some(end) = rest.find('>') else { return out };
let tag = &rest[..end];
let epub_type = extract_attr(tag, "epub:type").unwrap_or_default();
if epub_type.split_whitespace().any(|t| t == "toc") {
toc_body_start = Some(offset + end + 1);
break;
}
rest = &rest[end..];
offset += end;
}
let Some(start) = toc_body_start else { return out };
let remaining = &nav_text[start..];
let Some(end_idx) = remaining.find("</nav>") else { return out };
let body = &remaining[..end_idx];
let mut body_rest = body;
while let Some(idx) = body_rest.find("<a") {
body_rest = &body_rest[idx + "<a".len()..];
let Some(tag_end) = body_rest.find('>') else { break };
let tag = &body_rest[..tag_end];
if let Some(href) = extract_attr(tag, "href") {
let file_part = strip_fragment(&href);
if !file_part.is_empty() {
out.push(file_part);
}
}
body_rest = &body_rest[tag_end..];
}
out
}
fn nav_toc_out_of_spine_order(
nav_text: &str,
spine_items: &[(String, String)],
) -> Option<String> {
let nav_hrefs = extract_nav_toc_hrefs(nav_text);
if nav_hrefs.len() < 2 {
return None;
}
let mut spine_pos: HashMap<String, usize> = HashMap::new();
for (i, (_, href)) in spine_items.iter().enumerate() {
spine_pos.entry(href.clone()).or_insert(i);
}
let mut positions: Vec<(usize, String)> = Vec::new();
for h in &nav_hrefs {
if let Some(&p) = spine_pos.get(h) {
positions.push((p, h.clone()));
}
}
if positions.len() < 2 {
return None;
}
for window in positions.windows(2) {
let (a, b) = (&window[0], &window[1]);
if a.0 > b.0 {
return Some(format!(
"Nav TOC entry '{}' (spine pos {}) precedes '{}' (spine pos {}).",
a.1, a.0, b.1, b.0
));
}
}
None
}
fn extract_ncx_dtb_uid(ncx_text: &str) -> Option<String> {
let mut rest = ncx_text;
while let Some(idx) = rest.find("<meta") {
rest = &rest[idx + "<meta".len()..];
let Some(end) = rest.find('>') else { return None };
let tag = &rest[..end];
let name = extract_attr(tag, "name").unwrap_or_default();
if name == "dtb:uid" {
return extract_attr(tag, "content");
}
rest = &rest[end..];
}
None
}
fn find_empty_navpoint_text(ncx_text: &str) -> Vec<usize> {
let mut out = Vec::new();
let mut offset = 0usize;
while let Some(idx) = ncx_text[offset..].find("<navPoint") {
let abs_start = offset + idx;
let close_needle = "</navPoint>";
let body_end_rel = match ncx_text[abs_start..].find(close_needle) {
Some(e) => abs_start + e,
None => break,
};
let navpoint_body = &ncx_text[abs_start..body_end_rel];
let mut empty = false;
let mut text_line = line_of_offset(ncx_text, abs_start);
if let Some(text_idx) = navpoint_body.find("<text") {
let abs_text = abs_start + text_idx;
text_line = line_of_offset(ncx_text, abs_text);
let after_open = &ncx_text[abs_text + "<text".len()..];
if let Some(tag_end) = after_open.find('>') {
let open_tag = &after_open[..tag_end];
if open_tag.trim_end().ends_with('/') {
empty = true;
} else {
let rest_after_open = &after_open[tag_end + 1..];
if let Some(close_idx) = rest_after_open.find("</text>") {
let inner = &rest_after_open[..close_idx];
if inner.trim().is_empty() {
empty = true;
}
}
}
}
} else {
empty = true;
}
if empty {
out.push(text_line);
}
offset = body_end_rel + close_needle.len();
}
out
}
fn extract_spine_toc_idref(opf_text: &str) -> Option<String> {
let idx = opf_text.find("<spine")?;
let rest = &opf_text[idx + "<spine".len()..];
let end = rest.find('>')?;
let tag = &rest[..end];
extract_attr(tag, "toc")
}
fn opf_unique_identifier_value(opf_text: &str) -> Option<String> {
let pkg_idx = opf_text.find("<package")?;
let pkg_rest = &opf_text[pkg_idx + "<package".len()..];
let pkg_end = pkg_rest.find('>')?;
let pkg_tag = &pkg_rest[..pkg_end];
let unique_id = extract_attr(pkg_tag, "unique-identifier")?;
let mut rest = opf_text;
while let Some(idx) = rest.find("<dc:identifier") {
rest = &rest[idx + "<dc:identifier".len()..];
let Some(tag_end) = rest.find('>') else { return None };
let tag = &rest[..tag_end];
let id_attr = extract_attr(tag, "id").unwrap_or_default();
if id_attr == unique_id {
let after_open = &rest[tag_end + 1..];
let close_idx = after_open.find("</dc:identifier>")?;
return Some(after_open[..close_idx].to_string());
}
rest = &rest[tag_end..];
}
let mut rest = opf_text;
while let Some(idx) = rest.find("<identifier") {
rest = &rest[idx + "<identifier".len()..];
let Some(tag_end) = rest.find('>') else { return None };
let tag = &rest[..tag_end];
let id_attr = extract_attr(tag, "id").unwrap_or_default();
if id_attr == unique_id {
let after_open = &rest[tag_end + 1..];
let close_idx = after_open.find("</identifier>")?;
return Some(after_open[..close_idx].to_string());
}
rest = &rest[tag_end..];
}
None
}
fn parse_guide_references(opf_text: &str) -> Vec<(String, String)> {
let mut out = Vec::new();
let lower = opf_text.to_ascii_lowercase();
let guide_start = match lower.find("<guide") {
Some(i) => i,
None => return out,
};
let guide_end = lower[guide_start..]
.find("</guide>")
.map(|e| guide_start + e)
.unwrap_or(opf_text.len());
let guide_block = &opf_text[guide_start..guide_end];
let mut rest = guide_block;
while let Some(idx) = rest.find("<reference") {
rest = &rest[idx + "<reference".len()..];
let Some(end) = rest.find('>') else { break };
let tag = &rest[..end];
if let Some(href) = extract_attr(tag, "href") {
let title = extract_attr(tag, "title").unwrap_or_default();
out.push((href, title));
}
rest = &rest[end..];
}
out
}
fn strip_fragment(href: &str) -> String {
match href.find('#') {
Some(i) => href[..i].to_string(),
None => href.to_string(),
}
}
fn line_of_offset(content: &str, byte_offset: usize) -> usize {
content[..byte_offset.min(content.len())]
.bytes()
.filter(|b| *b == b'\n')
.count()
+ 1
}
#[allow(dead_code)]
fn _unused_path_check(_: &Path, _: &PathBuf) {}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn r5_4_detects_pagebreak_epub_type() {
let s = r#"<body><span epub:type="pagebreak" id="p1"></span></body>"#;
assert!(contains_pagebreak_epub_type(s));
}
#[test]
fn r5_4_detects_tokenised_pagebreak() {
let s = r#"<span epub:type="front pagebreak"></span>"#;
assert!(contains_pagebreak_epub_type(s));
}
#[test]
fn r5_4_ignores_other_epub_types() {
let s = r#"<section epub:type="chapter"></section>"#;
assert!(!contains_pagebreak_epub_type(s));
}
#[test]
fn r5_4_page_list_detected() {
let nav = r#"<nav epub:type="page-list"><ol></ol></nav>"#;
assert!(nav_has_page_list(nav));
}
#[test]
fn r5_4_no_page_list_when_only_toc_nav() {
let nav = r#"<nav epub:type="toc"><ol></ol></nav>"#;
assert!(!nav_has_page_list(nav));
}
#[test]
fn r5_5_finds_http_href() {
let s = r#"<a href="http://example.com/foo">x</a>"#;
let urls = find_remote_links(s);
assert_eq!(urls, vec!["http://example.com/foo".to_string()]);
}
#[test]
fn r5_5_finds_https_src() {
let s = r#"<img src="https://cdn.example.com/a.png"/>"#;
let urls = find_remote_links(s);
assert_eq!(urls, vec!["https://cdn.example.com/a.png".to_string()]);
}
#[test]
fn r5_5_ignores_relative_hrefs() {
let s = r#"<a href="chapter1.xhtml">x</a>"#;
assert!(find_remote_links(s).is_empty());
}
#[test]
fn r5_6_in_order_is_clean() {
let nav = r#"<nav epub:type="toc"><ol>
<li><a href="ch1.xhtml">One</a></li>
<li><a href="ch2.xhtml">Two</a></li>
<li><a href="ch3.xhtml">Three</a></li>
</ol></nav>"#;
let spine = vec![
("id1".to_string(), "ch1.xhtml".to_string()),
("id2".to_string(), "ch2.xhtml".to_string()),
("id3".to_string(), "ch3.xhtml".to_string()),
];
assert!(nav_toc_out_of_spine_order(nav, &spine).is_none());
}
#[test]
fn r5_6_out_of_order_fires() {
let nav = r#"<nav epub:type="toc"><ol>
<li><a href="ch2.xhtml">Two</a></li>
<li><a href="ch1.xhtml">One</a></li>
</ol></nav>"#;
let spine = vec![
("id1".to_string(), "ch1.xhtml".to_string()),
("id2".to_string(), "ch2.xhtml".to_string()),
];
assert!(nav_toc_out_of_spine_order(nav, &spine).is_some());
}
#[test]
fn r5_6_fragments_stripped() {
let nav = r#"<nav epub:type="toc"><ol>
<li><a href="ch1.xhtml#s1">A</a></li>
<li><a href="ch1.xhtml#s2">B</a></li>
<li><a href="ch2.xhtml">C</a></li>
</ol></nav>"#;
let spine = vec![
("id1".to_string(), "ch1.xhtml".to_string()),
("id2".to_string(), "ch2.xhtml".to_string()),
];
assert!(nav_toc_out_of_spine_order(nav, &spine).is_none());
}
#[test]
fn r5_7_extract_dtb_uid() {
let ncx = r#"<ncx><head><meta name="dtb:uid" content="bookuid-123"/></head></ncx>"#;
assert_eq!(extract_ncx_dtb_uid(ncx).as_deref(), Some("bookuid-123"));
}
#[test]
fn r5_7_opf_unique_identifier_value() {
let opf = r#"<?xml version="1.0"?><package unique-identifier="BookId">
<metadata><dc:identifier id="BookId">urn:uuid:abc-123</dc:identifier></metadata>
</package>"#;
assert_eq!(
opf_unique_identifier_value(opf).as_deref(),
Some("urn:uuid:abc-123")
);
}
#[test]
fn r5_7_missing_unique_identifier_attr() {
let opf = r#"<package><metadata></metadata></package>"#;
assert!(opf_unique_identifier_value(opf).is_none());
}
#[test]
fn r5_8_whitespace_in_dtb_uid() {
let s = " bookid ";
assert_ne!(s.trim().len(), s.len());
}
#[test]
fn r5_9_detects_empty_self_closing_text() {
let ncx = "<navPoint>\n<navLabel><text/></navLabel>\n</navPoint>";
let lines = find_empty_navpoint_text(ncx);
assert_eq!(lines.len(), 1);
}
#[test]
fn r5_9_detects_empty_text_element() {
let ncx = "<navPoint>\n<navLabel><text></text></navLabel>\n</navPoint>";
let lines = find_empty_navpoint_text(ncx);
assert_eq!(lines.len(), 1);
}
#[test]
fn r5_9_ignores_populated_text() {
let ncx = "<navPoint>\n<navLabel><text>Chapter 1</text></navLabel>\n</navPoint>";
assert!(find_empty_navpoint_text(ncx).is_empty());
}
#[test]
fn r5_10_parse_guide_references() {
let opf = r#"<package><guide>
<reference type="toc" title="TOC" href="toc.xhtml"/>
<reference type="cover" href="cover.xhtml"/>
</guide></package>"#;
let refs = parse_guide_references(opf);
assert_eq!(refs.len(), 2);
assert_eq!(refs[0].0, "toc.xhtml");
assert_eq!(refs[1].0, "cover.xhtml");
}
#[test]
fn r5_11_extract_spine_toc_idref() {
let opf = r#"<package><spine toc="ncx"><itemref idref="ch1"/></spine></package>"#;
assert_eq!(extract_spine_toc_idref(opf).as_deref(), Some("ncx"));
}
#[test]
fn r5_11_no_toc_attribute() {
let opf = r#"<package><spine><itemref idref="ch1"/></spine></package>"#;
assert!(extract_spine_toc_idref(opf).is_none());
}
}