use std::fs;
use std::path::PathBuf;
use super::Check;
use crate::extracted::ExtractedEpub;
use crate::validate::ValidationReport;
pub struct ParseEncodingChecks;
impl Check for ParseEncodingChecks {
fn ids(&self) -> &'static [&'static str] {
&[
"R6.6", "R6.7", "R6.8", "R6.9", "R6.10", "R6.11", "R6.12",
]
}
fn run(&self, epub: &ExtractedEpub, report: &mut ValidationReport) {
let opf = &epub.opf;
for (_id, (href, media_type)) in &opf.manifest {
let full = opf.base_dir.join(href);
let bytes = match fs::read(&full) {
Ok(b) => b,
Err(_) => continue,
};
if is_xhtml_media_type(media_type) {
scan_xhtml(href, &bytes, report);
} else if is_css_media_type(media_type) {
scan_css(href, &bytes, report);
}
}
if let Ok(opf_bytes) = fs::read(&epub.opf_path) {
let href = epub
.opf_path
.file_name()
.map(|n| n.to_string_lossy().to_string())
.unwrap_or_else(|| "content.opf".to_string());
scan_opf_entities(&href, &opf_bytes, report);
}
}
}
fn is_xhtml_media_type(mt: &str) -> bool {
let l = mt.to_ascii_lowercase();
l == "application/xhtml+xml" || l == "text/html"
}
fn is_css_media_type(mt: &str) -> bool {
let l = mt.to_ascii_lowercase();
l == "text/css"
}
fn scan_xhtml(href: &str, bytes: &[u8], report: &mut ValidationReport) {
let file = Some(PathBuf::from(href));
check_html_encoding(href, bytes, report);
let content_bytes = strip_utf8_bom(bytes);
let content = match std::str::from_utf8(content_bytes) {
Ok(s) => s.to_string(),
Err(_) => String::from_utf8_lossy(content_bytes).to_string(),
};
if let Some(ver) = xml_declaration_version(&content) {
if ver != "1.0" {
report.emit_at(
"R6.6",
format!("XML declaration sets version=\"{}\".", ver),
file.clone(),
None,
);
}
}
if has_external_entity(&content) {
report.emit_at("R6.7", "", file.clone(), None);
}
if let Some(dt) = extract_doctype(&content) {
if !is_canonical_doctype(&dt) {
let trimmed: String = dt.chars().take(120).collect();
report.emit_at(
"R6.8",
format!("Found: <!DOCTYPE{}...", trimmed),
file.clone(),
None,
);
}
}
if let Some(bad) = wrong_epub_namespace(&content) {
report.emit_at(
"R6.9",
format!("Found xmlns:epub=\"{}\".", bad),
file.clone(),
None,
);
}
if let Some(name) = find_undeclared_entity(&content) {
report.emit_at(
"R6.10",
format!("Entity: &{};", name),
file.clone(),
None,
);
}
}
fn scan_css(href: &str, bytes: &[u8], report: &mut ValidationReport) {
let file = Some(PathBuf::from(href));
if starts_with_utf16_bom(bytes) {
report.emit_at("R6.12", "File begins with a UTF-16 BOM.", file.clone(), None);
return;
}
let content_bytes = strip_utf8_bom(bytes);
let content = String::from_utf8_lossy(content_bytes);
if let Some(cs) = css_charset(&content) {
if !cs.eq_ignore_ascii_case("utf-8") {
report.emit_at(
"R6.12",
format!("@charset is \"{}\".", cs),
file.clone(),
None,
);
}
}
}
fn scan_opf_entities(href: &str, bytes: &[u8], report: &mut ValidationReport) {
let content_bytes = strip_utf8_bom(bytes);
let content = String::from_utf8_lossy(content_bytes);
if has_external_entity(&content) {
report.emit_at("R6.7", "", Some(PathBuf::from(href)), None);
}
}
fn starts_with_utf16_bom(bytes: &[u8]) -> bool {
bytes.starts_with(&[0xFF, 0xFE]) || bytes.starts_with(&[0xFE, 0xFF])
}
fn strip_utf8_bom(bytes: &[u8]) -> &[u8] {
if bytes.starts_with(&[0xEF, 0xBB, 0xBF]) {
&bytes[3..]
} else {
bytes
}
}
fn check_html_encoding(href: &str, bytes: &[u8], report: &mut ValidationReport) {
let file = Some(PathBuf::from(href));
if starts_with_utf16_bom(bytes) {
report.emit_at("R6.11", "File begins with a UTF-16 BOM.", file, None);
return;
}
let content_bytes = strip_utf8_bom(bytes);
let content = String::from_utf8_lossy(content_bytes);
if let Some(enc) = xml_declaration_encoding(&content) {
if !enc.eq_ignore_ascii_case("utf-8") {
report.emit_at(
"R6.11",
format!("XML declaration encoding is \"{}\".", enc),
file,
None,
);
}
}
}
fn xml_declaration_body(content: &str) -> Option<&str> {
let trimmed = content.trim_start();
let rest = trimmed.strip_prefix("<?xml")?;
let end = rest.find("?>")?;
Some(&rest[..end])
}
fn xml_declaration_version(content: &str) -> Option<String> {
let body = xml_declaration_body(content)?;
extract_pseudo_attr(body, "version")
}
fn xml_declaration_encoding(content: &str) -> Option<String> {
let body = xml_declaration_body(content)?;
extract_pseudo_attr(body, "encoding")
}
fn extract_pseudo_attr(body: &str, name: &str) -> Option<String> {
let needle = format!("{}=", name);
let idx = body.find(&needle)?;
let rest = &body[idx + needle.len()..];
let quote = rest.chars().next()?;
if quote != '"' && quote != '\'' {
return None;
}
let rest = &rest[1..];
let end = rest.find(quote)?;
Some(rest[..end].to_string())
}
fn has_external_entity(content: &str) -> bool {
let lower = content.to_ascii_lowercase();
let mut pos = 0usize;
while let Some(idx) = lower[pos..].find("<!entity") {
let start = pos + idx;
let end = match lower[start..].find('>') {
Some(e) => start + e,
None => break,
};
let body = &lower[start..end];
if body.contains(" system") || body.contains(" public") {
return true;
}
pos = end + 1;
}
false
}
fn extract_doctype(content: &str) -> Option<String> {
let lower = content.to_ascii_lowercase();
let idx = lower.find("<!doctype")?;
let start = idx + "<!doctype".len();
let rest = &content[start..];
let mut depth: i32 = 0;
let mut end_idx: Option<usize> = None;
for (i, c) in rest.char_indices() {
match c {
'[' => depth += 1,
']' => depth -= 1,
'>' if depth <= 0 => {
end_idx = Some(i);
break;
}
_ => {}
}
}
let end = end_idx?;
Some(rest[..end].to_string())
}
fn is_canonical_doctype(doctype_body: &str) -> bool {
let body = doctype_body.trim();
if body.eq_ignore_ascii_case("html") {
return true;
}
let lower = body.to_ascii_lowercase();
let known_dtds = [
"dtd/xhtml1-strict.dtd",
"dtd/xhtml1-transitional.dtd",
"dtd/xhtml1-frameset.dtd",
"dtd/xhtml11.dtd",
];
for dtd in known_dtds {
if lower.contains(dtd) {
return true;
}
}
false
}
fn wrong_epub_namespace(content: &str) -> Option<String> {
const NEEDLES: [&str; 2] = ["xmlns:epub=\"", "xmlns:epub='"];
for needle in NEEDLES {
if let Some(idx) = content.find(needle) {
let rest = &content[idx + needle.len()..];
let quote = needle.chars().last().unwrap();
if let Some(end) = rest.find(quote) {
let uri = &rest[..end];
if uri != "http://www.idpf.org/2007/ops" {
return Some(uri.to_string());
}
}
}
}
None
}
const ALLOWED_ENTITIES: &[&str] = &[
"amp", "lt", "gt", "quot", "apos",
"nbsp", "copy", "reg", "trade", "ndash", "mdash", "hellip", "lsquo",
"rsquo", "ldquo", "rdquo", "bull",
];
fn find_undeclared_entity(content: &str) -> Option<String> {
let bytes = content.as_bytes();
let mut i = 0usize;
while i < bytes.len() {
if bytes[i] != b'&' {
i += 1;
continue;
}
if i + 1 < bytes.len() && bytes[i + 1] == b'#' {
i += 1;
continue;
}
let start = i + 1;
let mut j = start;
if j >= bytes.len() || !bytes[j].is_ascii_alphabetic() {
i += 1;
continue;
}
while j < bytes.len() && bytes[j].is_ascii_alphanumeric() {
j += 1;
}
if j >= bytes.len() || bytes[j] != b';' {
i = j;
continue;
}
let name = std::str::from_utf8(&bytes[start..j]).unwrap_or("");
if !ALLOWED_ENTITIES.contains(&name) {
return Some(name.to_string());
}
i = j + 1;
}
None
}
fn css_charset(content: &str) -> Option<String> {
let trimmed = content.trim_start();
let rest = trimmed.strip_prefix("@charset")?;
let rest = rest.trim_start();
let quote = rest.chars().next()?;
if quote != '"' && quote != '\'' {
return None;
}
let rest = &rest[1..];
let end = rest.find(quote)?;
Some(rest[..end].to_string())
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn r6_6_xml_11_declaration_fires() {
let s = "<?xml version=\"1.1\" encoding=\"UTF-8\"?><html/>";
assert_eq!(xml_declaration_version(s).as_deref(), Some("1.1"));
}
#[test]
fn r6_6_xml_10_declaration_clean() {
let s = "<?xml version=\"1.0\" encoding=\"UTF-8\"?><html/>";
assert_eq!(xml_declaration_version(s).as_deref(), Some("1.0"));
}
#[test]
fn r6_7_system_entity_fires() {
let s = "<!DOCTYPE foo [ <!ENTITY xxe SYSTEM \"file:///etc/passwd\"> ]><html/>";
assert!(has_external_entity(s));
}
#[test]
fn r6_7_public_entity_fires() {
let s = "<!DOCTYPE foo [ <!ENTITY bar PUBLIC \"-//X//B\" \"b.dtd\"> ]><html/>";
assert!(has_external_entity(s));
}
#[test]
fn r6_7_no_entity_clean() {
let s = "<?xml version=\"1.0\"?><!DOCTYPE html><html/>";
assert!(!has_external_entity(s));
}
#[test]
fn r6_8_html5_doctype_canonical() {
let dt = extract_doctype("<!DOCTYPE html><html/>").unwrap();
assert!(is_canonical_doctype(&dt));
}
#[test]
fn r6_8_xhtml1_strict_canonical() {
let s = "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\" \
\"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">";
let dt = extract_doctype(s).unwrap();
assert!(is_canonical_doctype(&dt));
}
#[test]
fn r6_8_irregular_doctype_fires() {
let s = "<!DOCTYPE potato SYSTEM \"potato.dtd\">";
let dt = extract_doctype(s).unwrap();
assert!(!is_canonical_doctype(&dt));
}
#[test]
fn r6_8_no_doctype_ignored() {
assert!(extract_doctype("<html/>").is_none());
}
#[test]
fn r6_9_wrong_namespace_fires() {
let s = "<html xmlns:epub=\"http://www.example.com/ops\"/>";
assert_eq!(
wrong_epub_namespace(s).as_deref(),
Some("http://www.example.com/ops")
);
}
#[test]
fn r6_9_correct_namespace_clean() {
let s = "<html xmlns:epub=\"http://www.idpf.org/2007/ops\"/>";
assert!(wrong_epub_namespace(s).is_none());
}
#[test]
fn r6_9_no_epub_namespace_clean() {
let s = "<html xmlns=\"http://www.w3.org/1999/xhtml\"/>";
assert!(wrong_epub_namespace(s).is_none());
}
#[test]
fn r6_10_allowed_entities_clean() {
let s = "<p>a & b c — d …</p>";
assert!(find_undeclared_entity(s).is_none());
}
#[test]
fn r6_10_unknown_entity_fires() {
let s = "<p>Hello &foo; world</p>";
assert_eq!(find_undeclared_entity(s).as_deref(), Some("foo"));
}
#[test]
fn r6_10_numeric_references_clean() {
let s = "<p>— —</p>";
assert!(find_undeclared_entity(s).is_none());
}
#[test]
fn r6_11_utf16le_bom_detected() {
let bytes = [0xFF, 0xFE, 0x3C, 0x00];
assert!(starts_with_utf16_bom(&bytes));
}
#[test]
fn r6_11_utf16be_bom_detected() {
let bytes = [0xFE, 0xFF, 0x00, 0x3C];
assert!(starts_with_utf16_bom(&bytes));
}
#[test]
fn r6_11_utf8_bom_clean() {
let bytes = [0xEF, 0xBB, 0xBF, b'<'];
assert!(!starts_with_utf16_bom(&bytes));
assert_eq!(strip_utf8_bom(&bytes), &[b'<']);
}
#[test]
fn r6_11_non_utf8_encoding_fires() {
let s = "<?xml version=\"1.0\" encoding=\"ISO-8859-1\"?><html/>";
assert_eq!(
xml_declaration_encoding(s).as_deref(),
Some("ISO-8859-1")
);
}
#[test]
fn r6_11_utf8_encoding_clean() {
let s = "<?xml version=\"1.0\" encoding=\"UTF-8\"?><html/>";
assert!(xml_declaration_encoding(s)
.unwrap()
.eq_ignore_ascii_case("utf-8"));
}
#[test]
fn r6_12_bad_charset_fires() {
let s = "@charset \"ISO-8859-1\";\nbody { color: red; }";
assert_eq!(css_charset(s).as_deref(), Some("ISO-8859-1"));
}
#[test]
fn r6_12_utf8_charset_clean() {
let s = "@charset \"UTF-8\";\nbody { color: red; }";
assert!(css_charset(s).unwrap().eq_ignore_ascii_case("utf-8"));
}
#[test]
fn r6_12_no_charset_clean() {
let s = "body { color: red; }";
assert!(css_charset(s).is_none());
}
}