use std::collections::{HashMap, HashSet};
use std::fs;
use std::path::PathBuf;
use regex::Regex;
use super::Check;
use crate::extracted::ExtractedEpub;
use crate::profile::Profile;
use crate::validate::ValidationReport;
/// Stateless check that implements the R15 rule group (R15.1–R15.7 and
/// R15.e1–R15.e7). It only does work for EPUBs whose profile is
/// `Profile::Dict`.
pub struct DictChecks;
impl Check for DictChecks {
    /// Rule identifiers this check may emit: the legacy rules `R15.1`–`R15.7`
    /// followed by the EPUB 3 rules `R15.e1`–`R15.e7`.
    fn ids(&self) -> &'static [&'static str] {
        const RULE_IDS: &[&str] = &[
            "R15.1", "R15.2", "R15.3", "R15.4", "R15.5", "R15.6", "R15.7",
            "R15.e1", "R15.e2", "R15.e3", "R15.e4", "R15.e5", "R15.e6", "R15.e7",
        ];
        RULE_IDS
    }

    /// Runs the legacy rules and then the EPUB 3 rules against `epub`,
    /// recording findings in `report`. Does nothing unless the EPUB's profile
    /// is `Profile::Dict`.
    fn run(&self, epub: &ExtractedEpub, report: &mut ValidationReport) {
        if epub.profile == Profile::Dict {
            run_legacy_rules(epub, report);
            run_epub3_rules(epub, report);
        }
    }
}
/// Evaluates rules R15.1–R15.7 and records every violation in `report`.
///
/// * R15.1 / R15.2 — `<DictionaryInLanguage>` / `<DictionaryOutLanguage>` must
///   be non-empty and accepted by `is_valid_bcp47`.
/// * R15.3 — when any named `<idx:entry>` exists, one of them must carry the
///   default lookup index name (`"default"` when the OPF leaves it empty).
/// * R15.4 — the spine must contain at least one `<idx:entry`.
/// * R15.5 — reported per spine file returned in
///   `spine_files_without_frameset`.
/// * R15.6 — reported per `(file, line)` returned in `empty_orth_locations`.
/// * R15.7 — the OPF `<guide>` must contain a `type="index"` reference.
///
/// R15.4–R15.7 are emitted with an empty message.
// NOTE(review): presumably the report substitutes a per-rule default text for
// empty messages — confirm against `ValidationReport`.
fn run_legacy_rules(epub: &ExtractedEpub, report: &mut ValidationReport) {
    let opf = &epub.opf;

    // R15.1 and R15.2 share the same shape; only the rule id, the element
    // name in the message, and the inspected value differ.
    for (rule, element, value) in [
        ("R15.1", "DictionaryInLanguage", &opf.dict_in_language),
        ("R15.2", "DictionaryOutLanguage", &opf.dict_out_language),
    ] {
        if value.is_empty() {
            report.emit(rule, format!("Missing or empty <{}>.", element));
        } else if !is_valid_bcp47(value) {
            report.emit(rule, format!("Value '{}' is not a valid BCP47 code.", value));
        }
    }

    let scan = scan_spine_content(epub);

    let default_index = match opf.default_lookup_index.as_str() {
        "" => "default",
        configured => configured,
    };
    let any_named_entry = !scan.idx_entry_names.is_empty();
    if any_named_entry && !scan.idx_entry_names.contains(default_index) {
        report.emit(
            "R15.3",
            format!(
                "DefaultLookupIndex is '{0}' but no <idx:entry name=\"{0}\"> was found in spine content.",
                default_index
            ),
        );
    }

    if scan.idx_entry_count == 0 {
        report.emit("R15.4", "");
    }

    for href in &scan.spine_files_without_frameset {
        report.emit_at("R15.5", "", Some(PathBuf::from(href)), None);
    }

    for (href, line_no) in &scan.empty_orth_locations {
        report.emit_at("R15.6", "", Some(PathBuf::from(href)), Some(*line_no));
    }

    let guide_ok = guide_has_index_reference(&epub.opf_path);
    if !guide_ok {
        report.emit("R15.7", "");
    }
}
/// Evaluates the EPUB 3 dictionary rules R15.e1–R15.e7. Returns immediately
/// unless the OPF package version is exactly `"3.0"`.
///
/// * R15.e1 — no readable spine file contains `epub:type="dictionary"`
///   (single- or double-quoted).
/// * R15.e2 — the book has dictionary content (typed content, any
///   `<idx:entry`, or a non-empty `dict_in_language`) but no `dc:type` equal
///   to "dictionary" (ASCII case-insensitive).
/// * R15.e4 — a dictionary collection links to an href absent from the
///   manifest.
/// * R15.e3 — a non-XHTML resource with the Search Key Map media type whose
///   href does not end in `.xml`.
/// * R15.e7 — a collection resource that is neither XHTML nor a Search Key Map.
/// * R15.e5 / R15.e6 — a collection has more than one / no Search Key Map.
fn run_epub3_rules(epub: &ExtractedEpub, report: &mut ValidationReport) {
    let opf = &epub.opf;
    if opf.package_version != "3.0" {
        return;
    }

    // Stop at the first spine document that carries the dictionary epub:type.
    let mut typed_dictionary_seen = false;
    for (_, href) in &opf.spine_items {
        let Ok(text) = fs::read_to_string(opf.base_dir.join(href)) else {
            continue;
        };
        let markers = ["epub:type=\"dictionary\"", "epub:type='dictionary'"];
        if markers.iter().any(|marker| text.contains(*marker)) {
            typed_dictionary_seen = true;
            break;
        }
    }
    if !typed_dictionary_seen {
        report.emit("R15.e1", "");
    }

    // Short-circuits left to right, so the spine is only re-scanned for
    // <idx:entry when no typed content was found.
    let has_dict_content = typed_dictionary_seen
        || scan_spine_for_idx_entry(epub)
        || !opf.dict_in_language.is_empty();
    let declares_dict_type = opf
        .dc_types
        .iter()
        .any(|dc_type| dc_type.eq_ignore_ascii_case("dictionary"));
    if has_dict_content && !declares_dict_type {
        report.emit("R15.e2", "");
    }

    let collections = parse_dictionary_collections(&epub.opf_path);

    // One map answers both "is this href in the manifest?" and "what is its
    // media type?".
    let media_by_href: HashMap<String, String> = opf
        .manifest
        .values()
        .map(|(href, media_type)| (href.clone(), media_type.clone()))
        .collect();

    for collection in &collections {
        let mut skm_total = 0usize;

        for link in &collection.links {
            let Some(media) = media_by_href.get(link) else {
                report.emit(
                    "R15.e4",
                    format!("Collection references '{}' not in manifest.", link),
                );
                continue;
            };
            let media = media.as_str();

            // XHTML content documents are always acceptable collection members.
            if media == "application/xhtml+xml" {
                continue;
            }

            let has_xml_extension = link.to_lowercase().ends_with(".xml");
            let looks_like_skm =
                media == "application/vnd.epub.search-key-map+xml" || has_xml_extension;

            if looks_like_skm {
                skm_total += 1;
                if !has_xml_extension {
                    report.emit(
                        "R15.e3",
                        format!("Search Key Map '{}' does not use .xml extension.", link),
                    );
                }
            } else {
                report.emit(
                    "R15.e7",
                    format!(
                        "Collection resource '{}' has media-type '{}', not XHTML or SKM.",
                        link, media
                    ),
                );
            }
        }

        match skm_total {
            0 => report.emit("R15.e6", ""),
            1 => {}
            many => report.emit(
                "R15.e5",
                format!("Collection contains {} Search Key Map documents.", many),
            ),
        }
    }
}
/// Aggregated findings from one pass over every readable spine document,
/// as produced by `scan_spine_content`.
struct SpineScan {
/// Total number of `<idx:entry` occurrences across all scanned files.
idx_entry_count: usize,
/// Distinct double-quoted `name` attribute values seen on `<idx:entry` tags.
idx_entry_names: HashSet<String>,
/// Hrefs of spine files that contain at least one `<idx:entry` but no
/// `<mbp:frameset`.
spine_files_without_frameset: Vec<String>,
/// `(spine href, 1-based line number)` for each `<idx:orth` that has
/// neither a non-empty `value` attribute nor text in its body.
empty_orth_locations: Vec<(String, usize)>,
}
/// Reads every spine document once and gathers the facts the legacy rules
/// need: entry count, entry index names, files lacking `<mbp:frameset`, and
/// the locations of headword-less `<idx:orth` elements.
///
/// Spine files that cannot be read as UTF-8 text are skipped silently.
fn scan_spine_content(epub: &ExtractedEpub) -> SpineScan {
    let opf = &epub.opf;

    let mut idx_entry_count = 0usize;
    let mut idx_entry_names = HashSet::new();
    let mut spine_files_without_frameset = Vec::new();
    let mut empty_orth_locations = Vec::new();

    for (_, href) in &opf.spine_items {
        let Ok(text) = fs::read_to_string(opf.base_dir.join(href)) else {
            continue;
        };

        let entries_here = count_idx_entries(&text);
        idx_entry_count += entries_here;
        idx_entry_names.extend(collect_idx_entry_names(&text));

        // Only files that actually hold entries are required to have a frameset.
        if entries_here > 0 && !text.contains("<mbp:frameset") {
            spine_files_without_frameset.push(href.clone());
        }

        empty_orth_locations.extend(
            find_empty_orth_lines(&text)
                .into_iter()
                .map(|line_no| (href.clone(), line_no)),
        );
    }

    SpineScan {
        idx_entry_count,
        idx_entry_names,
        spine_files_without_frameset,
        empty_orth_locations,
    }
}
/// Returns `true` as soon as any readable spine document contains the
/// substring `<idx:entry`; unreadable files are ignored.
fn scan_spine_for_idx_entry(epub: &ExtractedEpub) -> bool {
    for (_, href) in &epub.opf.spine_items {
        let path = epub.opf.base_dir.join(href);
        match fs::read_to_string(&path) {
            Ok(text) if text.contains("<idx:entry") => return true,
            _ => {}
        }
    }
    false
}
/// Counts the non-overlapping occurrences of the literal `<idx:entry` in
/// `content`. Closing tags (`</idx:entry>`) do not match.
fn count_idx_entries(content: &str) -> usize {
    const OPENING: &str = "<idx:entry";
    content.match_indices(OPENING).count()
}
/// Returns, in document order, the double-quoted `name` attribute of every
/// `<idx:entry` opening tag in `content`. Tags without a `name` contribute
/// nothing. Scanning stops at the first `<idx:entry` that has no `>` after it.
fn collect_idx_entry_names(content: &str) -> Vec<String> {
    const OPENING: &str = "<idx:entry";

    let mut names = Vec::new();
    let mut cursor = 0usize;
    while let Some(found) = content[cursor..].find(OPENING) {
        let attrs_start = cursor + found + OPENING.len();
        let Some(attrs_len) = content[attrs_start..].find('>') else {
            break;
        };
        let attrs_end = attrs_start + attrs_len;

        // `Option` is iterable, so this pushes zero or one name.
        names.extend(extract_dq_attr(&content[attrs_start..attrs_end], "name"));

        // Resume at the closing '>' of this tag.
        cursor = attrs_end;
    }
    names
}
/// Returns the 1-based line number of every `<idx:orth` element in `content`
/// that carries no headword.
///
/// An element has a headword when either its double-quoted `value` attribute
/// is non-empty, or it is not self-closing and the text up to the next
/// `</idx:orth>` contains non-whitespace characters outside of markup.
/// An element with no closing tag and no `value` is reported. Scanning stops
/// at the first `<idx:orth` that has no `>` after it.
fn find_empty_orth_lines(content: &str) -> Vec<usize> {
    const OPENING: &str = "<idx:orth";
    const CLOSING: &str = "</idx:orth>";

    let mut flagged = Vec::new();
    let mut cursor = 0usize;
    while let Some(found) = content[cursor..].find(OPENING) {
        let tag_start = cursor + found;
        let attrs_start = tag_start + OPENING.len();
        let Some(attrs_len) = content[attrs_start..].find('>') else {
            break;
        };
        let tag_close = attrs_start + attrs_len;
        let attrs = &content[attrs_start..tag_close];

        // The next search always resumes right after this tag's '>'.
        cursor = tag_close + 1;

        let has_value = extract_dq_attr(attrs, "value").map_or(false, |v| !v.is_empty());
        if has_value {
            continue;
        }

        // Without a value attribute the headword may still live in the body.
        let headword_in_body = !attrs.ends_with('/') && {
            let after_tag = &content[cursor..];
            after_tag
                .find(CLOSING)
                .map_or(false, |body_len| has_non_whitespace_text(&after_tag[..body_len]))
        };
        if !headword_in_body {
            flagged.push(line_of(content, tag_start));
        }
    }
    flagged
}
/// Reports whether `body` contains any non-whitespace character outside of
/// markup. Everything from a `<` up to the next `>` is treated as markup and
/// ignored; the `<` and `>` characters themselves never count as text.
fn has_non_whitespace_text(body: &str) -> bool {
    let mut inside_markup = false;
    body.chars().any(|ch| {
        if ch == '<' {
            inside_markup = true;
            false
        } else if ch == '>' {
            inside_markup = false;
            false
        } else {
            !inside_markup && !ch.is_whitespace()
        }
    })
}
/// Extracts the value of the double-quoted attribute `attr` from `tag`, the
/// text of an opening tag (with or without the leading `<name`).
///
/// The attribute name must start the string or follow whitespace, so asking
/// for `name` does not pick up `filename="…"`, and `value` does not pick up
/// `data-value="…"`.
///
/// Returns `None` when no such attribute exists or its closing quote is
/// missing. Single-quoted values and whitespace around `=` are not recognised.
fn extract_dq_attr(tag: &str, attr: &str) -> Option<String> {
    let needle = format!("{}=\"", attr);
    let mut search_from = 0usize;
    while let Some(found) = tag[search_from..].find(&needle) {
        let name_start = search_from + found;
        let value_start = name_start + needle.len();
        // Accept the hit only at an attribute-name boundary; otherwise it is
        // the tail of a longer attribute name.
        let at_boundary = name_start == 0 || tag[..name_start].ends_with(char::is_whitespace);
        if at_boundary {
            let rest = &tag[value_start..];
            let value_len = rest.find('"')?;
            return Some(rest[..value_len].to_string());
        }
        search_from = value_start;
    }
    None
}
/// Returns the 1-based line number of `byte_offset` within `content`, i.e.
/// one plus the number of `\n` bytes before that offset.
///
/// Offsets past the end are clamped to `content.len()`. The count runs over
/// raw bytes, so an offset that falls inside a multi-byte character is
/// handled without panicking.
fn line_of(content: &str, byte_offset: usize) -> usize {
    let prefix_len = byte_offset.min(content.len());
    let newlines_before = content.as_bytes()[..prefix_len]
        .iter()
        .filter(|&&byte| byte == b'\n')
        .count();
    newlines_before + 1
}
/// Checks `tag` against a simplified, case-sensitive BCP 47 shape:
///
/// * a primary language subtag of 2–3 lowercase ASCII letters,
/// * an optional script subtag (one uppercase letter, three lowercase),
/// * an optional region subtag (two uppercase letters or three digits),
/// * any number of further subtags made of lowercase letters and digits.
///
/// Because of the last rule an all-lowercase tag such as `en-us` is accepted,
/// while `EN` or `en_US` are rejected. The regex is compiled once and cached.
fn is_valid_bcp47(tag: &str) -> bool {
    static PATTERN: std::sync::OnceLock<Regex> = std::sync::OnceLock::new();
    PATTERN
        .get_or_init(|| {
            Regex::new(r"^[a-z]{2,3}(-[A-Z][a-z]{3})?(-[A-Z]{2}|-\d{3})?(-[a-z0-9]+)*$").unwrap()
        })
        .is_match(tag)
}
/// Reports whether the OPF file at `opf_path` has a `<guide>` section that
/// contains a `<reference>` whose tag text includes `type="index"` (single-
/// or double-quoted).
///
/// Matching is done on an ASCII-lowercased copy of the file, so element
/// names, attribute names and the value are matched case-insensitively. If
/// `</guide>` is missing, the guide is taken to run to the end of the file.
/// Returns `false` when the file cannot be read, when there is no `<guide`,
/// or when a `<reference` with no closing `>` is reached.
fn guide_has_index_reference(opf_path: &std::path::Path) -> bool {
    let Ok(raw) = fs::read_to_string(opf_path) else {
        return false;
    };
    let lowered = raw.to_ascii_lowercase();

    let Some(guide_open) = lowered.find("<guide") else {
        return false;
    };
    // ASCII lowercasing keeps byte length, so `raw.len()` is a valid bound here.
    let guide_close = match lowered[guide_open..].find("</guide>") {
        Some(rel) => guide_open + rel,
        None => raw.len(),
    };

    let mut cursor = &lowered[guide_open..guide_close];
    loop {
        let Some(pos) = cursor.find("<reference") else {
            return false;
        };
        let after_name = &cursor[pos + "<reference".len()..];
        let Some(gt) = after_name.find('>') else {
            return false;
        };
        let attrs = &after_name[..gt];
        let is_index = ["type=\"index\"", "type='index'"]
            .iter()
            .any(|needle| attrs.contains(*needle));
        if is_index {
            return true;
        }
        cursor = &after_name[gt..];
    }
}
/// One `<collection>` element from the OPF whose `role` attribute contains
/// "dictionary", as returned by `parse_dictionary_collections`.
struct DictCollection {
/// The double-quoted `href` values of the `<link` tags found inside the
/// collection body, in document order.
links: Vec<String>,
}
/// Scans the OPF file at `opf_path` for `<collection>` elements whose `role`
/// attribute contains "dictionary" (ASCII case-insensitive) and returns the
/// `href` of every `<link` inside each of them, in document order.
///
/// This is a plain text scan, not an XML parse:
/// * The closing tag is searched for only after the end of the opening tag,
///   so a malformed opening tag cannot make the body slice panic.
/// * A self-closing `<collection …/>` is treated as an empty collection
///   instead of swallowing text up to a later `</collection>`.
/// * Nested collections are not handled: the first `</collection>` ends the
///   body.
///
/// Returns an empty vector when the file cannot be read. Scanning stops at
/// the first collection that has no `>` or no closing tag.
fn parse_dictionary_collections(opf_path: &std::path::Path) -> Vec<DictCollection> {
    const OPENING: &str = "<collection";
    const CLOSING: &str = "</collection>";

    let content = match fs::read_to_string(opf_path) {
        Ok(text) => text,
        Err(_) => return Vec::new(),
    };

    let mut out = Vec::new();
    let mut rest = content.as_str();
    while let Some(idx) = rest.find(OPENING) {
        rest = &rest[idx..];
        let Some(tag_end) = rest.find('>') else {
            break;
        };
        let open_tag = &rest[..tag_end];
        let after_open = &rest[tag_end + 1..];
        let role = extract_dq_attr(open_tag, "role").unwrap_or_default();

        // Split the remaining text into this collection's body and whatever
        // follows it.
        let (body, remainder) = if open_tag.ends_with('/') {
            ("", after_open)
        } else {
            let Some(close_at) = after_open.find(CLOSING) else {
                break;
            };
            (&after_open[..close_at], &after_open[close_at + CLOSING.len()..])
        };

        if role.to_ascii_lowercase().contains("dictionary") {
            let mut links = Vec::new();
            let mut link_rest = body;
            while let Some(li) = link_rest.find("<link") {
                link_rest = &link_rest[li + "<link".len()..];
                let Some(link_end) = link_rest.find('>') else {
                    break;
                };
                links.extend(extract_dq_attr(&link_rest[..link_end], "href"));
                link_rest = &link_rest[link_end..];
            }
            out.push(DictCollection { links });
        }

        rest = remainder;
    }
    out
}
// Unit tests for the string-scanning helpers in this file. Most run on
// in-memory strings; the two OPF-based tests write a scratch file under the
// system temp directory, in a folder named after the current process id.
#[cfg(test)]
mod tests {
use super::*;
// Well-formed language, script and region combinations are accepted.
#[test]
fn bcp47_accepts_common_codes() {
assert!(is_valid_bcp47("en"));
assert!(is_valid_bcp47("el"));
assert!(is_valid_bcp47("grc"));
assert!(is_valid_bcp47("en-US"));
assert!(is_valid_bcp47("zh-Hant"));
assert!(is_valid_bcp47("zh-Hant-TW"));
}
// Empty, all-uppercase, underscore-separated and digit-only inputs are rejected.
#[test]
fn bcp47_rejects_garbage() {
assert!(!is_valid_bcp47(""));
assert!(!is_valid_bcp47("ENGLISH"));
assert!(!is_valid_bcp47("en_US"));
assert!(!is_valid_bcp47("123"));
}
// Only opening `<idx:entry` tags are counted; closing tags are not.
#[test]
fn count_idx_entries_counts_opening_tags() {
let html = r#"<idx:entry><idx:orth value="a"/></idx:entry>
<idx:entry>x</idx:entry>"#;
assert_eq!(count_idx_entries(html), 2);
}
// The `name` attribute is read even when other attributes follow it.
#[test]
fn collect_idx_entry_names_reads_name_attr() {
let html = r#"<idx:entry name="default" scriptable="yes">x</idx:entry>
<idx:entry name="other">y</idx:entry>"#;
let names = collect_idx_entry_names(html);
assert!(names.contains(&"default".to_string()));
assert!(names.contains(&"other".to_string()));
}
// An explicit empty `value` is flagged with its 1-based line; a populated one is not.
#[test]
fn find_empty_orth_lines_flags_missing_value() {
let html = "<idx:entry>\n <idx:orth value=\"\"/>\n <idx:orth value=\"ok\"/>\n</idx:entry>";
let lines = find_empty_orth_lines(html);
assert_eq!(lines, vec![2]);
}
// A self-closing `<idx:orth/>` with no `value` attribute at all is flagged.
#[test]
fn find_empty_orth_lines_flags_missing_attr() {
let html = "<idx:entry>\n<idx:orth/>\n</idx:entry>";
let lines = find_empty_orth_lines(html);
assert_eq!(lines, vec![2]);
}
// A non-empty `value` attribute is sufficient on its own.
#[test]
fn find_empty_orth_lines_ignores_populated() {
let html = "<idx:orth value=\"abc\"/>";
assert!(find_empty_orth_lines(html).is_empty());
}
// The headword may be supplied as body text instead of a `value` attribute.
#[test]
fn find_empty_orth_lines_accepts_body_form_headword() {
let html = "<idx:entry>\n<idx:orth><b>hello</b></idx:orth>\n</idx:entry>";
assert!(find_empty_orth_lines(html).is_empty());
}
// Extra markup and surrounding whitespace in the body do not hide the headword text.
#[test]
fn find_empty_orth_lines_accepts_body_with_br() {
let html = "<idx:orth>\n<b>-eresse</b><br/>\n</idx:orth>";
assert!(find_empty_orth_lines(html).is_empty());
}
// A body consisting only of markup and whitespace counts as empty.
#[test]
fn find_empty_orth_lines_flags_body_with_only_markup() {
let html = "<idx:orth>\n<br/>\n</idx:orth>";
let lines = find_empty_orth_lines(html);
assert_eq!(lines, vec![1]);
}
// Characters inside tags and plain whitespace never count as text.
#[test]
fn has_non_whitespace_text_strips_tags() {
assert!(!has_non_whitespace_text("<b></b>"));
assert!(!has_non_whitespace_text(" \n\t"));
assert!(has_non_whitespace_text("<b>x</b>"));
assert!(has_non_whitespace_text("hi"));
}
// Writes two OPF variants to a scratch file: a guide with a type="index"
// reference is detected, a guide with only type="toc" is not.
#[test]
fn guide_has_index_reference_detects_type_index() {
let dir = std::env::temp_dir().join(format!("kindling_dict_guide_{}", std::process::id()));
std::fs::create_dir_all(&dir).unwrap();
let path = dir.join("a.opf");
std::fs::write(
&path,
r#"<package><guide><reference type="index" href="a.html"/></guide></package>"#,
)
.unwrap();
assert!(guide_has_index_reference(&path));
std::fs::write(
&path,
r#"<package><guide><reference type="toc" href="a.html"/></guide></package>"#,
)
.unwrap();
assert!(!guide_has_index_reference(&path));
// Best-effort cleanup; a failure to remove the scratch directory is ignored.
std::fs::remove_dir_all(&dir).ok();
}
// A single dictionary-role collection yields its link hrefs in document order.
#[test]
fn parse_dictionary_collections_extracts_links() {
let dir = std::env::temp_dir().join(format!("kindling_dict_coll_{}", std::process::id()));
std::fs::create_dir_all(&dir).unwrap();
let path = dir.join("b.opf");
std::fs::write(
&path,
r#"<package>
<collection role="dictionary">
<link href="content1.xhtml"/>
<link href="skm.xml"/>
</collection>
</package>"#,
)
.unwrap();
let cols = parse_dictionary_collections(&path);
assert_eq!(cols.len(), 1);
assert_eq!(cols[0].links, vec!["content1.xhtml", "skm.xml"]);
// Best-effort cleanup; a failure to remove the scratch directory is ignored.
std::fs::remove_dir_all(&dir).ok();
}
// Attributes are found whether they come first or later in the tag text.
#[test]
fn extract_dq_attr_handles_mid_tag() {
let tag = r#" name="default" scriptable="yes""#;
assert_eq!(extract_dq_attr(tag, "name"), Some("default".to_string()));
assert_eq!(
extract_dq_attr(tag, "scriptable"),
Some("yes".to_string())
);
}
}