use std::collections::{HashMap, HashSet};
use std::fs;
use std::path::PathBuf;
use super::Check;
use crate::extracted::ExtractedEpub;
use crate::validate::ValidationReport;
/// Check group covering the OPF grammar rules R8.1 through R8.10.
///
/// The rules inspect the package `prefix` attribute and the manifest
/// `properties` attributes of the OPF file referenced by the EPUB.
pub struct OpfGrammarChecks;

impl Check for OpfGrammarChecks {
    /// Rule identifiers this check group may emit.
    fn ids(&self) -> &'static [&'static str] {
        const RULE_IDS: &[&str] = &[
            "R8.1", "R8.2", "R8.3", "R8.4", "R8.5", "R8.6", "R8.7", "R8.8", "R8.9", "R8.10",
        ];
        RULE_IDS
    }

    /// Runs every R8.x rule against `epub`, appending findings to `report`.
    ///
    /// Does nothing unless the package version is exactly `"3.0"`, and
    /// silently gives up when the OPF file cannot be read as UTF-8 text.
    fn run(&self, epub: &ExtractedEpub, report: &mut ValidationReport) {
        if epub.opf.package_version != "3.0" {
            return;
        }
        let Ok(opf_text) = fs::read_to_string(&epub.opf_path) else {
            return;
        };

        // Prefix declarations are validated first; the surviving map is then
        // used to resolve prefixed manifest properties.
        let declared_prefixes = scan_package_prefix_attr(&opf_text, report);
        let manifest_items = parse_manifest_items(&opf_text);

        run_property_grammar_rules(epub, &manifest_items, &declared_prefixes, report);
        run_content_vs_property_rules(epub, &manifest_items, report);
    }
}
/// Prefix names treated as reserved by `scan_package_prefix_attr`.
///
/// Declaring one of these in the package `prefix` attribute with a URI that
/// differs from the one returned by `reserved_prefix_canonical` is reported
/// as R8.3.
const RESERVED_PREFIXES: &[&str] = &[
"dcterms", "epub", "marc", "media", "onix", "opf", "rendition", "schema", "xsd",
];
/// Manifest `properties` tokens that are accepted without a prefix.
///
/// Any other unprefixed token is reported as R8.9 by
/// `run_property_grammar_rules`.
const KNOWN_UNPREFIXED_PROPERTIES: &[&str] = &[
"cover-image",
"mathml",
"nav",
"remote-resources",
"scripted",
"svg",
"switch",
// NOTE(review): non-hyphenated spelling; presumably tolerated as an alias of
// "cover-image" (it shares the same media-type list in `allowed_media_types`).
// Confirm this leniency is intended.
"coverimage",
];
/// Prefixes that a manifest property may use without a matching entry in the
/// package `prefix` attribute; R8.10 is not raised for them.
///
/// Currently holds the same nine names as `RESERVED_PREFIXES`, in a different
/// order.
const DEFAULT_PREFIXES: &[&str] = &[
"dcterms", "marc", "media", "onix", "rendition", "schema", "xsd", "opf", "epub",
];
/// Returns the canonical URI bound to the reserved prefix `name`.
///
/// Yields `None` when `name` is not one of the prefixes in the table below.
/// The comparison is case-sensitive.
fn reserved_prefix_canonical(name: &str) -> Option<&'static str> {
    const CANONICAL_URIS: &[(&str, &str)] = &[
        ("dcterms", "http://purl.org/dc/terms/"),
        ("epub", "http://www.idpf.org/2007/ops"),
        ("marc", "http://id.loc.gov/vocabulary/"),
        ("media", "http://www.idpf.org/epub/vocab/overlays/#"),
        ("onix", "http://www.editeur.org/ONIX/book/codelists/current.html#"),
        ("opf", "http://www.idpf.org/2007/opf"),
        ("rendition", "http://www.idpf.org/vocab/rendition/#"),
        ("schema", "http://schema.org/"),
        ("xsd", "http://www.w3.org/2001/XMLSchema#"),
    ];

    CANONICAL_URIS
        .iter()
        .find(|(prefix, _)| *prefix == name)
        .map(|&(_, uri)| uri)
}
/// Validates the `prefix` attribute of the `<package>` element and returns
/// the accepted `name -> URI` bindings.
///
/// Findings emitted on `report`:
/// * R8.1 - odd token count, a name token not of the form `name:`, or a name
///   that fails `is_valid_ncname`;
/// * R8.4 - the URI token fails `is_valid_uri`;
/// * R8.2 - the same prefix name is declared again;
/// * R8.3 - a reserved prefix is bound to a URI other than its canonical one.
///
/// A pair that triggers any finding is left out of the returned map. An odd
/// token count aborts the scan with an empty map. The map is also empty when
/// there is no `<package>` tag or no `prefix` attribute.
fn scan_package_prefix_attr(
    opf_text: &str,
    report: &mut ValidationReport,
) -> HashMap<String, String> {
    let mut declared: HashMap<String, String> = HashMap::new();

    let Some(prefix_value) =
        find_package_open_tag(opf_text).and_then(|tag| extract_attr_value(tag, "prefix"))
    else {
        return declared;
    };

    // The attribute is a whitespace-separated list of `name: uri` pairs.
    let tokens: Vec<&str> = prefix_value.split_whitespace().collect();
    if tokens.len() % 2 == 1 {
        report.emit(
            "R8.1",
            format!("prefix=\"{}\" has an odd number of tokens.", prefix_value),
        );
        return declared;
    }

    let mut seen_names: HashSet<&str> = HashSet::new();
    for pair in tokens.chunks_exact(2) {
        let (name_tok, uri_tok) = (pair[0], pair[1]);

        // The name token must be a non-empty name followed by a colon.
        let name = match name_tok.strip_suffix(':') {
            Some(stripped) if !stripped.is_empty() => stripped,
            _ => {
                report.emit(
                    "R8.1",
                    format!("'{}' is not a valid prefix name (expected 'name:').", name_tok),
                );
                continue;
            }
        };

        if !is_valid_ncname(name) {
            report.emit(
                "R8.1",
                format!("'{}' is not a valid NCName for a prefix.", name),
            );
            continue;
        }

        if !is_valid_uri(uri_tok) {
            report.emit(
                "R8.4",
                format!("Prefix '{}' maps to malformed URI '{}'.", name, uri_tok),
            );
            continue;
        }

        // Only names that passed the checks above are remembered as seen.
        if !seen_names.insert(name) {
            report.emit(
                "R8.2",
                format!("Prefix '{}' is declared more than once.", name),
            );
            continue;
        }

        // Reserved prefixes may be re-declared only with their canonical URI.
        let canonical = RESERVED_PREFIXES
            .contains(&name)
            .then(|| reserved_prefix_canonical(name))
            .flatten()
            .filter(|uri| !uri.is_empty());
        if let Some(canonical) = canonical {
            if uri_tok != canonical {
                report.emit(
                    "R8.3",
                    format!(
                        "Reserved prefix '{}' rebound to '{}'; canonical is '{}'.",
                        name, uri_tok, canonical
                    ),
                );
                continue;
            }
        }

        declared.insert(name.to_string(), uri_tok.to_string());
    }

    declared
}
/// Returns the text between `<package` and the next `>` in `text`.
///
/// The tag name is matched ASCII case-insensitively and the first occurrence
/// wins. The slice starts right after the tag name, so it normally begins
/// with whitespace, and it excludes the closing `>`. Returns `None` when
/// there is no `<package` or no `>` after it.
fn find_package_open_tag(text: &str) -> Option<&str> {
    const TAG_OPEN: &str = "<package";

    // ASCII lowercasing keeps byte offsets identical to `text`.
    let folded = text.to_ascii_lowercase();
    let body_start = folded.find(TAG_OPEN)? + TAG_OPEN.len();

    let tail = &text[body_start..];
    tail.find('>').map(|close| &tail[..close])
}
/// Returns the value of attribute `attr` from the inside of an XML start tag.
///
/// `tag_body` is the text that follows the element name, up to but excluding
/// the closing `>` (a trailing `/` from a self-closing tag is tolerated).
///
/// The tag body is scanned attribute by attribute, so:
/// * only an attribute whose full name equals `attr` matches; `data-id="x"`
///   is not mistaken for `id`;
/// * text inside another attribute's quoted value (such as
///   `href="p.php?id=5"`) is never mistaken for an attribute;
/// * whitespace around `=` is accepted, as XML allows.
///
/// Both `"` and `'` quoting are supported. The first matching attribute wins.
/// Returns `None` when the attribute is absent or a quoted value is
/// unterminated. Unquoted values (not valid XML) are skipped.
fn extract_attr_value(tag_body: &str, attr: &str) -> Option<String> {
    let is_name_end = |c: char| c.is_whitespace() || c == '=' || c == '/';

    let mut rest = tag_body;
    loop {
        // Skip separators between attributes, and a self-closing slash.
        rest = rest.trim_start_matches(|c: char| c.is_whitespace() || c == '/');
        if rest.is_empty() {
            return None;
        }

        let name_len = rest.find(is_name_end).unwrap_or(rest.len());
        let (name, after_name) = rest.split_at(name_len);
        let after_name = after_name.trim_start();

        // A bare token with no `=` is not an attribute; move past it.
        let Some(after_eq) = after_name.strip_prefix('=') else {
            rest = after_name;
            continue;
        };
        let after_eq = after_eq.trim_start();

        // Only quoted values are recognised; resume scanning otherwise.
        let Some(quote) = after_eq
            .chars()
            .next()
            .filter(|&c| c == '"' || c == '\'')
        else {
            rest = after_eq;
            continue;
        };

        // The quote is ASCII, so skipping one byte is safe.
        let quoted = &after_eq[1..];
        let close = quoted.find(quote)?;
        if name == attr {
            return Some(quoted[..close].to_string());
        }
        rest = &quoted[close + 1..];
    }
}
/// Checks `s` against an ASCII-only subset of the XML NCName grammar.
///
/// The first character must be an ASCII letter or `_`; every following
/// character must be an ASCII letter, an ASCII digit, `_`, `-` or `.`.
/// The empty string and any name containing `:` are rejected.
fn is_valid_ncname(s: &str) -> bool {
    let mut rest = s.chars();
    match rest.next() {
        Some(head) if head.is_ascii_alphabetic() || head == '_' => {
            rest.all(|c| c.is_ascii_alphanumeric() || matches!(c, '_' | '-' | '.'))
        }
        _ => false,
    }
}
/// Shallow syntactic check that `s` looks like an absolute URI.
///
/// Accepts `scheme:rest` where `scheme` starts with an ASCII letter and
/// continues with ASCII letters, ASCII digits, `+`, `-` or `.`, and where
/// `rest` (everything after the first `:`) is non-empty. Nothing after the
/// scheme is validated.
fn is_valid_uri(s: &str) -> bool {
    let Some((scheme, rest)) = s.split_once(':') else {
        return false;
    };
    if scheme.is_empty() || rest.is_empty() {
        return false;
    }

    let mut scheme_chars = scheme.chars();
    let starts_with_letter = matches!(scheme_chars.next(), Some(c) if c.is_ascii_alphabetic());
    starts_with_letter
        && scheme_chars.all(|c| c.is_ascii_alphanumeric() || matches!(c, '+' | '-' | '.'))
}
/// One `<item>` element of the OPF manifest, as read by
/// `parse_manifest_items`. Attributes missing from the tag are stored as
/// empty values.
#[derive(Debug, Clone)]
struct ManifestItem {
// Value of the `id` attribute.
id: String,
// Value of the `href` attribute, exactly as written in the OPF.
href: String,
// Value of the `media-type` attribute.
media_type: String,
// Whitespace-separated tokens of the `properties` attribute.
properties: Vec<String>,
}
/// Extracts the `<item>` entries of the `<manifest>` element from raw OPF text.
///
/// The manifest is located with an ASCII case-insensitive search for
/// `<manifest` and ends at `</manifest>`, or at the end of the text when the
/// closing tag is missing. Inside it, every tag starting with `<item` is
/// read, except those whose name continues with `ref` (such as `<itemref`).
/// An item is kept only when it has a non-empty `id` or `href`.
///
/// Returns the items in document order, or an empty vector when no
/// `<manifest` is present.
fn parse_manifest_items(opf_text: &str) -> Vec<ManifestItem> {
    const ITEM_OPEN: &str = "<item";

    // ASCII lowercasing keeps byte offsets aligned with `opf_text`.
    let folded = opf_text.to_ascii_lowercase();
    let Some(start) = folded.find("<manifest") else {
        return Vec::new();
    };
    let stop = match folded[start..].find("</manifest>") {
        Some(offset) => start + offset,
        None => opf_text.len(),
    };

    let mut items = Vec::new();
    let mut cursor = &opf_text[start..stop];
    while let Some(open) = cursor.find(ITEM_OPEN) {
        let after_open = &cursor[open + ITEM_OPEN.len()..];
        let Some(close) = after_open.find('>') else {
            break;
        };
        // `tag` is everything between `<item` and `>`; scanning resumes at the `>`.
        let (tag, remainder) = after_open.split_at(close);
        cursor = remainder;

        if tag.starts_with("ref") {
            continue;
        }

        let attr = |name: &str| extract_attr_value(tag, name).unwrap_or_default();
        let id = attr("id");
        let href = attr("href");
        if id.is_empty() && href.is_empty() {
            continue;
        }

        items.push(ManifestItem {
            id,
            href,
            media_type: attr("media-type"),
            properties: attr("properties")
                .split_whitespace()
                .map(String::from)
                .collect(),
        });
    }
    items
}
/// Returns the media types on which the unprefixed manifest `property` may
/// appear, or `None` when the property carries no media-type restriction
/// here.
///
/// The property name is matched case-sensitively.
fn allowed_media_types(property: &str) -> Option<&'static [&'static str]> {
    const COVER_IMAGE_TYPES: &[&str] = &[
        "image/jpeg",
        "image/png",
        "image/gif",
        "image/svg+xml",
        "image/webp",
    ];
    const XHTML_ONLY: &[&str] = &["application/xhtml+xml"];
    const XHTML_OR_SVG: &[&str] = &["application/xhtml+xml", "image/svg+xml"];

    if matches!(property, "cover-image" | "coverimage") {
        Some(COVER_IMAGE_TYPES)
    } else if matches!(property, "nav" | "svg") {
        Some(XHTML_ONLY)
    } else if matches!(property, "mathml" | "scripted" | "remote-resources") {
        Some(XHTML_OR_SVG)
    } else {
        None
    }
}
/// Applies the per-token grammar rules to every manifest `properties` value.
///
/// For each property token of each item, the first failing rule is reported:
/// * R8.8 - the token fails `is_valid_property_token`;
/// * R8.10 - a prefixed token whose prefix is neither in `DEFAULT_PREFIXES`
///   nor in `declared_prefixes`;
/// * R8.9 - an unprefixed token not listed in `KNOWN_UNPREFIXED_PROPERTIES`;
/// * R8.5 - a known unprefixed token used on a media type outside
///   `allowed_media_types` (compared ASCII case-insensitively).
///
/// Findings are attached to the item's `href` when it is non-empty.
/// `epub` is currently unused.
fn run_property_grammar_rules(
    epub: &ExtractedEpub,
    items: &[ManifestItem],
    declared_prefixes: &HashMap<String, String>,
    report: &mut ValidationReport,
) {
    let _ = epub;

    for item in items {
        let file = (!item.href.is_empty()).then(|| PathBuf::from(&item.href));

        for prop in &item.properties {
            if !is_valid_property_token(prop) {
                report.emit_at(
                    "R8.8",
                    format!("Property '{}' on item '{}' is malformed.", prop, item.id),
                    file.clone(),
                    None,
                );
                continue;
            }

            match split_prefixed_property(prop) {
                Some((prefix, _local)) => {
                    let prefix_known = DEFAULT_PREFIXES.contains(&prefix)
                        || declared_prefixes.contains_key(prefix);
                    if !prefix_known {
                        report.emit_at(
                            "R8.10",
                            format!(
                                "Property '{}' uses undeclared prefix '{}' on item '{}'.",
                                prop, prefix, item.id
                            ),
                            file.clone(),
                            None,
                        );
                    }
                }
                None => {
                    if !KNOWN_UNPREFIXED_PROPERTIES.contains(&prop.as_str()) {
                        report.emit_at(
                            "R8.9",
                            format!(
                                "Unknown property '{}' on item '{}' has no declared prefix.",
                                prop, item.id
                            ),
                            file.clone(),
                            None,
                        );
                        continue;
                    }

                    // Properties without a media-type restriction need no further check.
                    let Some(allowed) = allowed_media_types(prop) else {
                        continue;
                    };
                    let permitted = allowed
                        .iter()
                        .any(|candidate| candidate.eq_ignore_ascii_case(&item.media_type));
                    if !permitted {
                        report.emit_at(
                            "R8.5",
                            format!(
                                "Property '{}' not permitted on media-type '{}' (item '{}').",
                                prop, item.media_type, item.id
                            ),
                            file.clone(),
                            None,
                        );
                    }
                }
            }
        }
    }
}
/// Returns `true` when `prop` is a well-formed manifest property token.
///
/// A token is either a bare local name, or `prefix:local` split at the first
/// `:`. The prefix must satisfy `is_valid_ncname` and the local part must
/// satisfy `is_valid_property_local`. The empty string is never valid.
fn is_valid_property_token(prop: &str) -> bool {
    if prop.is_empty() {
        return false;
    }
    match split_prefixed_property(prop) {
        Some((prefix, local)) => is_valid_ncname(prefix) && is_valid_property_local(local),
        None => is_valid_property_local(prop),
    }
}
/// Returns `true` when `s` is acceptable as the local part of a property
/// token. Currently this is exactly the `is_valid_ncname` check.
fn is_valid_property_local(s: &str) -> bool {
is_valid_ncname(s)
}
/// Splits `prop` at its first `:` into `(prefix, local)`.
///
/// Returns `None` when `prop` contains no `:`. Either part may be empty, and
/// any further `:` characters stay in the local part.
fn split_prefixed_property(prop: &str) -> Option<(&str, &str)> {
    prop.split_once(':')
}
/// Features found in a content document by `detect_content_features`.
/// All flags default to `false`.
#[derive(Debug, Default, Clone)]
struct ContentFeatures {
// A `math` element start tag was found.
has_mathml: bool,
// An `svg` element start tag was found.
has_svg: bool,
// A `script` element start tag was found.
has_scripted: bool,
// `has_remote_attr` found an http(s) `src`/`href` reference.
has_remote_resources: bool,
}
/// Cross-checks the declared manifest properties of spine documents against
/// what their content actually contains.
///
/// Only spine entries that resolve to a manifest item with an XHTML media
/// type and a readable file under `epub.opf.base_dir` are examined. For each
/// of `mathml`, `svg`, `scripted` and `remote-resources`:
/// * R8.6 is reported when the feature is detected but not declared;
/// * R8.7 is reported when the property is declared but not detected.
///
/// For one document, all R8.6 findings come before all R8.7 findings.
fn run_content_vs_property_rules(
    epub: &ExtractedEpub,
    items: &[ManifestItem],
    report: &mut ValidationReport,
) {
    // When ids repeat, the last manifest item with that id wins.
    let mut by_id: HashMap<String, &ManifestItem> = HashMap::new();
    for item in items {
        by_id.insert(item.id.clone(), item);
    }

    for (id, _href) in &epub.opf.spine_items {
        let item = match by_id.get(id) {
            Some(found) if is_xhtml_media_type(&found.media_type) => *found,
            _ => continue,
        };
        let Ok(bytes) = fs::read(epub.opf.base_dir.join(&item.href)) else {
            continue;
        };

        let features = detect_content_features(&bytes);
        let file = Some(PathBuf::from(&item.href));
        let declares = |name: &str| item.properties.iter().any(|p| p == name);
        let mut flag = |rule: &'static str, message: String| {
            report.emit_at(rule, message, file.clone(), None);
        };

        // R8.6: feature used but not declared.
        if features.has_mathml && !declares("mathml") {
            flag(
                "R8.6",
                format!("Item '{}' uses MathML but does not declare properties=\"mathml\".", id),
            );
        }
        if features.has_svg && !declares("svg") {
            flag(
                "R8.6",
                format!("Item '{}' uses SVG but does not declare properties=\"svg\".", id),
            );
        }
        if features.has_scripted && !declares("scripted") {
            flag(
                "R8.6",
                format!("Item '{}' contains scripts but does not declare properties=\"scripted\".", id),
            );
        }
        if features.has_remote_resources && !declares("remote-resources") {
            flag(
                "R8.6",
                format!(
                    "Item '{}' references remote resources but does not declare properties=\"remote-resources\".",
                    id
                ),
            );
        }

        // R8.7: property declared but feature not found.
        if !features.has_mathml && declares("mathml") {
            flag(
                "R8.7",
                format!("Item '{}' declares properties=\"mathml\" but no <math> is present.", id),
            );
        }
        if !features.has_svg && declares("svg") {
            flag(
                "R8.7",
                format!("Item '{}' declares properties=\"svg\" but no <svg> is present.", id),
            );
        }
        if !features.has_scripted && declares("scripted") {
            flag(
                "R8.7",
                format!("Item '{}' declares properties=\"scripted\" but no <script> is present.", id),
            );
        }
        if !features.has_remote_resources && declares("remote-resources") {
            flag(
                "R8.7",
                format!(
                    "Item '{}' declares properties=\"remote-resources\" but no http(s) URL is present.",
                    id
                ),
            );
        }
    }
}
/// Returns `true` when `mt` is `application/xhtml+xml` or `text/html`,
/// compared ASCII case-insensitively. The whole string must match; media
/// type parameters or surrounding whitespace cause a mismatch.
fn is_xhtml_media_type(mt: &str) -> bool {
    ["application/xhtml+xml", "text/html"]
        .iter()
        .any(|known| mt.eq_ignore_ascii_case(known))
}
fn detect_content_features(bytes: &[u8]) -> ContentFeatures {
let text = std::str::from_utf8(bytes)
.map(|s| s.to_string())
.unwrap_or_else(|_| String::from_utf8_lossy(bytes).to_string());
let mut features = ContentFeatures::default();
features.has_mathml = has_element_start(&text, "math");
features.has_svg = has_element_start(&text, "svg");
features.has_scripted = has_element_start(&text, "script");
features.has_remote_resources = has_remote_attr(&text);
features
}
/// Returns `true` when `content` contains a start tag for element `name`.
///
/// Every `<` is inspected and the qualified name that follows it is read up
/// to the first whitespace character, `>` or `/`. The tag matches when:
/// * the qualified name equals `name` (`<svg>`, `<svg/>`, `<svg width=…>`,
///   and also `<svg` followed by a newline or tab, which is common in
///   pretty-printed XHTML);
/// * the namespace prefix equals `name` (`<svg:rect>`); or
/// * the local part equals `name` under any prefix (`<m:math>`).
///
/// End tags (`</math>`), attributes such as `xmlns:math="…"`, and longer
/// names such as `<mathx>` do not match. Matching is case-sensitive. This is
/// a textual scan, so tags inside comments or CDATA are counted as well.
fn has_element_start(content: &str, name: &str) -> bool {
    content.match_indices('<').any(|(lt, _)| {
        let after_lt = &content[lt + 1..];
        let qname_len = after_lt
            .find(|c: char| c.is_whitespace() || c == '>' || c == '/')
            .unwrap_or(after_lt.len());
        let qname = &after_lt[..qname_len];

        match qname.split_once(':') {
            Some((prefix, local)) => prefix == name || local == name,
            None => qname == name,
        }
    })
}
/// Returns `true` when `content` references a remote (`http://` or
/// `https://`) resource through a quoted `src` or `href` attribute.
///
/// An `href` on an `a` or `area` element is a hyperlink, not a resource the
/// document loads, so it is ignored. Otherwise every document with an
/// external link would be flagged as needing `remote-resources`. Any other
/// `href` (such as `<link>` or SVG `xlink:href` on `image`) and every `src`
/// still count.
///
/// This is a textual scan: only lowercase attribute names and schemes are
/// recognised, and an attribute name that merely ends in `src=` or `href=`
/// also matches.
fn has_remote_attr(content: &str) -> bool {
    const REMOTE_SRC: [&str; 4] = [
        "src=\"http://",
        "src='http://",
        "src=\"https://",
        "src='https://",
    ];
    const REMOTE_HREF: [&str; 4] = [
        "href=\"http://",
        "href='http://",
        "href=\"https://",
        "href='https://",
    ];

    if REMOTE_SRC.iter().any(|needle| content.contains(*needle)) {
        return true;
    }
    REMOTE_HREF.iter().any(|needle| {
        content
            .match_indices(*needle)
            .any(|(pos, _)| !href_belongs_to_hyperlink(content, pos))
    })
}

/// Reports whether the attribute starting at byte offset `attr_pos` sits
/// inside the start tag of an `a` or `area` element (any namespace prefix).
fn href_belongs_to_hyperlink(content: &str, attr_pos: usize) -> bool {
    let Some(lt) = content[..attr_pos].rfind('<') else {
        return false;
    };
    let tag = &content[lt + 1..attr_pos];
    // A `>` before the attribute means the match is not inside this tag;
    // stay conservative and keep counting it as a remote reference.
    if tag.contains('>') {
        return false;
    }

    let qname_len = tag
        .find(|c: char| c.is_whitespace() || c == '/')
        .unwrap_or(tag.len());
    let qname = &tag[..qname_len];
    let local = qname.rsplit(':').next().unwrap_or(qname);
    local.eq_ignore_ascii_case("a") || local.eq_ignore_ascii_case("area")
}
/// Unit tests for the OPF grammar rules and their helper functions.
#[cfg(test)]
mod tests {
    use super::*;

    /// Creates an empty report to collect findings in.
    fn fresh_report() -> ValidationReport {
        ValidationReport::new()
    }

    /// Collects the rule ids of all findings in `report` that carry one.
    fn rule_ids(report: &ValidationReport) -> Vec<&'static str> {
        report.findings.iter().filter_map(|f| f.rule_id).collect()
    }

    // ---- R8.1 - R8.4: package `prefix` attribute ----

    #[test]
    fn r8_1_odd_token_count_fires() {
        let opf = r#"<package version="3.0" prefix="foo: http://example.com extra">"#;
        let mut report = fresh_report();
        let _ = scan_package_prefix_attr(opf, &mut report);
        assert!(rule_ids(&report).contains(&"R8.1"));
    }

    #[test]
    fn r8_1_prefix_without_colon_fires() {
        let opf = r#"<package version="3.0" prefix="foo http://example.com">"#;
        let mut report = fresh_report();
        let _ = scan_package_prefix_attr(opf, &mut report);
        assert!(rule_ids(&report).contains(&"R8.1"));
    }

    #[test]
    fn r8_1_well_formed_clean() {
        let opf = r#"<package version="3.0" prefix="foo: http://example.com/foo">"#;
        let mut report = fresh_report();
        let map = scan_package_prefix_attr(opf, &mut report);
        assert!(rule_ids(&report).is_empty());
        assert_eq!(map.get("foo").map(String::as_str), Some("http://example.com/foo"));
    }

    #[test]
    fn r8_2_duplicate_prefix_fires() {
        let opf = r#"<package prefix="foo: http://a.example foo: http://b.example">"#;
        let mut report = fresh_report();
        let _ = scan_package_prefix_attr(opf, &mut report);
        assert!(rule_ids(&report).contains(&"R8.2"));
    }

    #[test]
    fn r8_3_reserved_prefix_rebound_fires() {
        let opf = r#"<package prefix="epub: http://evil.example">"#;
        let mut report = fresh_report();
        let _ = scan_package_prefix_attr(opf, &mut report);
        assert!(rule_ids(&report).contains(&"R8.3"));
    }

    #[test]
    fn r8_3_reserved_prefix_canonical_clean() {
        let opf = r#"<package prefix="epub: http://www.idpf.org/2007/ops">"#;
        let mut report = fresh_report();
        let _ = scan_package_prefix_attr(opf, &mut report);
        assert!(!rule_ids(&report).contains(&"R8.3"));
    }

    #[test]
    fn r8_4_garbage_uri_fires() {
        let opf = r#"<package prefix="foo: not-a-uri">"#;
        let mut report = fresh_report();
        let _ = scan_package_prefix_attr(opf, &mut report);
        assert!(rule_ids(&report).contains(&"R8.4"));
    }

    #[test]
    fn r8_4_custom_scheme_clean() {
        let opf = r#"<package prefix="foo: urn:isbn:9780000000001">"#;
        let mut report = fresh_report();
        let _ = scan_package_prefix_attr(opf, &mut report);
        assert!(!rule_ids(&report).contains(&"R8.4"));
    }

    // ---- R8.5, R8.8 - R8.10: manifest property grammar ----

    #[test]
    fn r8_5_cover_image_on_non_image_fires() {
        let items = vec![ManifestItem {
            id: "x".into(),
            href: "x.xhtml".into(),
            media_type: "application/xhtml+xml".into(),
            properties: vec!["cover-image".into()],
        }];
        let epub = dummy_epub();
        let mut report = fresh_report();
        run_property_grammar_rules(&epub, &items, &HashMap::new(), &mut report);
        assert!(rule_ids(&report).contains(&"R8.5"));
    }

    #[test]
    fn r8_5_nav_on_non_xhtml_fires() {
        let items = vec![ManifestItem {
            id: "n".into(),
            href: "nav.css".into(),
            media_type: "text/css".into(),
            properties: vec!["nav".into()],
        }];
        let epub = dummy_epub();
        let mut report = fresh_report();
        run_property_grammar_rules(&epub, &items, &HashMap::new(), &mut report);
        assert!(rule_ids(&report).contains(&"R8.5"));
    }

    #[test]
    fn r8_5_mathml_on_xhtml_clean() {
        let items = vec![ManifestItem {
            id: "m".into(),
            href: "c.xhtml".into(),
            media_type: "application/xhtml+xml".into(),
            properties: vec!["mathml".into()],
        }];
        let epub = dummy_epub();
        let mut report = fresh_report();
        run_property_grammar_rules(&epub, &items, &HashMap::new(), &mut report);
        assert!(!rule_ids(&report).contains(&"R8.5"));
    }

    #[test]
    fn r8_5_cover_image_on_jpeg_clean() {
        let items = vec![ManifestItem {
            id: "c".into(),
            href: "cover.jpg".into(),
            media_type: "image/jpeg".into(),
            properties: vec!["cover-image".into()],
        }];
        let epub = dummy_epub();
        let mut report = fresh_report();
        run_property_grammar_rules(&epub, &items, &HashMap::new(), &mut report);
        assert!(!rule_ids(&report).contains(&"R8.5"));
    }

    #[test]
    fn r8_8_empty_like_token_fires() {
        let items = vec![ManifestItem {
            id: "x".into(),
            href: "x.xhtml".into(),
            media_type: "application/xhtml+xml".into(),
            properties: vec!["1bad".into()],
        }];
        let epub = dummy_epub();
        let mut report = fresh_report();
        run_property_grammar_rules(&epub, &items, &HashMap::new(), &mut report);
        assert!(rule_ids(&report).contains(&"R8.8"));
    }

    #[test]
    fn r8_9_unknown_unprefixed_fires() {
        let items = vec![ManifestItem {
            id: "x".into(),
            href: "x.xhtml".into(),
            media_type: "application/xhtml+xml".into(),
            properties: vec!["futuristic".into()],
        }];
        let epub = dummy_epub();
        let mut report = fresh_report();
        run_property_grammar_rules(&epub, &items, &HashMap::new(), &mut report);
        assert!(rule_ids(&report).contains(&"R8.9"));
    }

    #[test]
    fn r8_10_undeclared_prefix_fires() {
        let items = vec![ManifestItem {
            id: "x".into(),
            href: "x.xhtml".into(),
            media_type: "application/xhtml+xml".into(),
            properties: vec!["myns:thing".into()],
        }];
        let epub = dummy_epub();
        let mut report = fresh_report();
        run_property_grammar_rules(&epub, &items, &HashMap::new(), &mut report);
        assert!(rule_ids(&report).contains(&"R8.10"));
    }

    #[test]
    fn r8_10_declared_prefix_clean() {
        let items = vec![ManifestItem {
            id: "x".into(),
            href: "x.xhtml".into(),
            media_type: "application/xhtml+xml".into(),
            properties: vec!["myns:thing".into()],
        }];
        let mut declared = HashMap::new();
        declared.insert("myns".to_string(), "http://example.com/ns".to_string());
        let epub = dummy_epub();
        let mut report = fresh_report();
        run_property_grammar_rules(&epub, &items, &declared, &mut report);
        assert!(!rule_ids(&report).contains(&"R8.10"));
    }

    #[test]
    fn r8_10_default_prefix_clean() {
        let items = vec![ManifestItem {
            id: "x".into(),
            href: "x.xhtml".into(),
            media_type: "application/xhtml+xml".into(),
            properties: vec!["rendition:layout".into()],
        }];
        let epub = dummy_epub();
        let mut report = fresh_report();
        run_property_grammar_rules(&epub, &items, &HashMap::new(), &mut report);
        assert!(!rule_ids(&report).contains(&"R8.10"));
    }

    // ---- Lexical helpers ----

    #[test]
    fn ncname_accepts_simple_names() {
        assert!(is_valid_ncname("foo"));
        assert!(is_valid_ncname("Foo_bar"));
        assert!(is_valid_ncname("foo-bar.baz"));
    }

    #[test]
    fn ncname_rejects_digits_first_and_colons() {
        assert!(!is_valid_ncname("1bad"));
        assert!(!is_valid_ncname("a:b"));
        assert!(!is_valid_ncname(""));
    }

    #[test]
    fn uri_accepts_http_and_urn() {
        assert!(is_valid_uri("http://example.com"));
        assert!(is_valid_uri("https://example.com/foo"));
        assert!(is_valid_uri("urn:isbn:9780000000001"));
    }

    #[test]
    fn uri_rejects_naked_text() {
        assert!(!is_valid_uri(""));
        assert!(!is_valid_uri("not-a-uri"));
        assert!(!is_valid_uri("://oops"));
    }

    #[test]
    fn split_prefixed_property_works() {
        assert_eq!(split_prefixed_property("rendition:layout"), Some(("rendition", "layout")));
        assert_eq!(split_prefixed_property("cover-image"), None);
    }

    // ---- Content feature detection ----

    #[test]
    fn detect_mathml_fires_on_math_element() {
        let html = br#"<html><body><math xmlns="http://www.w3.org/1998/Math/MathML"><mi>x</mi></math></body></html>"#;
        let f = detect_content_features(html);
        assert!(f.has_mathml);
    }

    #[test]
    fn detect_mathml_ignored_when_only_xmlns() {
        let html = br#"<html xmlns:math="http://exslt.org/math"><body><p>no math here</p></body></html>"#;
        let f = detect_content_features(html);
        assert!(!f.has_mathml);
    }

    #[test]
    fn detect_svg_fires_on_svg_element() {
        let html = br#"<html><body><svg width="1"/></body></html>"#;
        let f = detect_content_features(html);
        assert!(f.has_svg);
    }

    #[test]
    fn detect_script_fires_on_script_element() {
        let html = br#"<html><body><script>alert(1)</script></body></html>"#;
        let f = detect_content_features(html);
        assert!(f.has_scripted);
    }

    #[test]
    fn detect_remote_fires_on_http_src() {
        let html = br#"<html><body><img src="https://cdn.example.com/a.jpg"/></body></html>"#;
        let f = detect_content_features(html);
        assert!(f.has_remote_resources);
    }

    #[test]
    fn detect_remote_ignored_on_relative_href() {
        let html = br#"<html><body><a href="other.xhtml">other</a></body></html>"#;
        let f = detect_content_features(html);
        assert!(!f.has_remote_resources);
    }

    /// Builds an `ExtractedEpub` from a minimal OPF written to a per-process
    /// directory under the system temp dir.
    ///
    /// Errors from creating the directory or writing the file are ignored;
    /// a failure then surfaces through the `expect` on `from_opf_path`.
    fn dummy_epub() -> ExtractedEpub {
        let tmp = std::env::temp_dir().join(format!(
            "kindling_opf_grammar_test_{}",
            std::process::id()
        ));
        let _ = std::fs::create_dir_all(&tmp);
        let opf_path = tmp.join("x.opf");
        let _ = std::fs::write(&opf_path, b"<package version=\"3.0\"></package>");
        ExtractedEpub::from_opf_path(&opf_path)
            .expect("test OPFData should parse")
    }

    // ---- Manifest parsing ----

    #[test]
    fn parse_manifest_items_reads_properties() {
        let opf = r#"<package>
            <manifest>
                <item id="a" href="a.xhtml" media-type="application/xhtml+xml" properties="nav"/>
                <item id="b" href="b.jpg" media-type="image/jpeg" properties="cover-image"/>
            </manifest>
        </package>"#;
        let items = parse_manifest_items(opf);
        assert_eq!(items.len(), 2);
        assert_eq!(items[0].properties, vec!["nav".to_string()]);
        assert_eq!(items[1].properties, vec!["cover-image".to_string()]);
    }
}