use quick_xml::events::Event;
use quick_xml::Reader;
use regex::Regex;
use std::collections::HashMap;
use std::path::{Path, PathBuf};
#[derive(Debug, Clone)]
pub struct ManifestItem {
pub id: String,
pub href: String,
pub media_type: String,
pub properties: String,
pub fallback: Option<String>,
pub fallback_style: Option<String>,
}
#[derive(Debug, Clone)]
pub struct SpineItemRef {
pub idref: String,
pub linear: String,
#[allow(dead_code)]
pub properties: String,
}
pub struct OPFData {
pub base_dir: PathBuf,
pub title: String,
pub author: String,
pub language: String,
pub identifier: String,
pub date: String,
pub dict_in_language: String,
pub dict_out_language: String,
pub default_lookup_index: String,
pub spine_items: Vec<(String, String)>, pub manifest: HashMap<String, (String, String)>, pub manifest_items: Vec<ManifestItem>,
pub raw_itemrefs: Vec<SpineItemRef>,
pub coverimage_id: Option<String>,
pub is_fixed_layout: bool,
pub original_resolution: Option<String>,
pub page_progression_direction: Option<String>,
pub package_version: String,
pub dc_types: Vec<String>,
pub embedded_cover_href: Option<String>,
}
impl OPFData {
pub fn parse(opf_path: &Path) -> Result<Self, Box<dyn std::error::Error>> {
let opf_path = opf_path.canonicalize().unwrap_or_else(|_| opf_path.to_path_buf());
let base_dir = opf_path
.parent()
.unwrap_or(Path::new("."))
.to_path_buf();
let content = std::fs::read_to_string(&opf_path)?;
let mut data = OPFData {
base_dir,
title: String::new(),
author: String::new(),
language: String::from("en"),
identifier: String::new(),
date: String::new(),
dict_in_language: String::new(),
dict_out_language: String::new(),
default_lookup_index: String::from("default"),
spine_items: Vec::new(),
manifest: HashMap::new(),
manifest_items: Vec::new(),
raw_itemrefs: Vec::new(),
coverimage_id: None,
is_fixed_layout: false,
original_resolution: None,
page_progression_direction: None,
package_version: String::new(),
dc_types: Vec::new(),
embedded_cover_href: None,
};
let cleaned = clean_opf_xml(&content);
data.parse_xml(&cleaned)?;
Ok(data)
}
fn parse_xml(&mut self, xml: &str) -> Result<(), Box<dyn std::error::Error>> {
let mut reader = Reader::from_str(xml);
reader.config_mut().trim_text(true);
let mut in_metadata = false;
let mut in_manifest = false;
let mut in_spine = false;
let mut current_tag = String::new();
let mut buf = Vec::new();
loop {
match reader.read_event_into(&mut buf) {
Ok(Event::Start(ref e)) | Ok(Event::Empty(ref e)) => {
let local_name = local_tag_name(e.name().as_ref());
let lower = local_name.to_ascii_lowercase();
match lower.as_str() {
"package" => {
for attr in e.attributes().flatten() {
if attr.key.as_ref() == b"version" {
self.package_version =
String::from_utf8_lossy(&attr.value).to_string();
}
}
}
"metadata" => in_metadata = true,
"manifest" => in_manifest = true,
"spine" => {
in_spine = true;
for attr in e.attributes().flatten() {
if attr.key.as_ref() == b"page-progression-direction" {
let ppd = String::from_utf8_lossy(&attr.value).to_string();
if !ppd.is_empty() {
self.page_progression_direction = Some(ppd);
}
}
}
}
"title" | "creator" | "language" | "identifier" | "date"
if in_metadata =>
{
current_tag = lower.clone();
}
"type" if in_metadata => {
current_tag = "type".to_string();
}
"dictionaryinlanguage" if in_metadata => {
current_tag = "DictionaryInLanguage".to_string();
}
"dictionaryoutlanguage" if in_metadata => {
current_tag = "DictionaryOutLanguage".to_string();
}
"defaultlookupindex" if in_metadata => {
current_tag = "DefaultLookupIndex".to_string();
}
"embeddedcover" if in_metadata => {
current_tag = "EmbeddedCover".to_string();
}
"meta" if in_metadata => {
let mut name_val = String::new();
let mut content_val = String::new();
let mut property_val = String::new();
for attr in e.attributes().flatten() {
match attr.key.as_ref() {
b"name" => {
name_val = String::from_utf8_lossy(&attr.value).to_string();
}
b"content" => {
content_val = String::from_utf8_lossy(&attr.value).to_string();
}
b"property" => {
property_val = String::from_utf8_lossy(&attr.value).to_string();
}
_ => {}
}
}
if name_val == "fixed-layout" && content_val == "true" {
self.is_fixed_layout = true;
}
if name_val == "original-resolution" && !content_val.is_empty() {
self.original_resolution = Some(content_val.clone());
}
if property_val == "rendition:layout" {
current_tag = "rendition:layout".to_string();
}
}
"item" if in_manifest => {
let mut id = String::new();
let mut href = String::new();
let mut media_type = String::new();
let mut properties = String::new();
let mut fallback: Option<String> = None;
let mut fallback_style: Option<String> = None;
for attr in e.attributes().flatten() {
match attr.key.as_ref() {
b"id" => {
id = String::from_utf8_lossy(&attr.value).to_string()
}
b"href" => {
href = String::from_utf8_lossy(&attr.value).to_string()
}
b"media-type" => {
media_type =
String::from_utf8_lossy(&attr.value).to_string()
}
b"properties" => {
properties =
String::from_utf8_lossy(&attr.value).to_string()
}
b"fallback" => {
fallback = Some(
String::from_utf8_lossy(&attr.value).to_string(),
);
}
b"fallback-style" => {
fallback_style = Some(
String::from_utf8_lossy(&attr.value).to_string(),
);
}
_ => {}
}
}
if !id.is_empty() {
if properties.split_whitespace().any(|p| p == "coverimage") {
self.coverimage_id = Some(id.clone());
}
self.manifest.insert(id.clone(), (href.clone(), media_type.clone()));
self.manifest_items.push(ManifestItem {
id,
href,
media_type,
properties,
fallback,
fallback_style,
});
}
}
"itemref" if in_spine => {
let mut idref = String::new();
let mut linear = String::new();
let mut properties = String::new();
for attr in e.attributes().flatten() {
match attr.key.as_ref() {
b"idref" => {
idref =
String::from_utf8_lossy(&attr.value).to_string();
}
b"linear" => {
linear =
String::from_utf8_lossy(&attr.value).to_string();
}
b"properties" => {
properties =
String::from_utf8_lossy(&attr.value).to_string();
}
_ => {}
}
}
self.raw_itemrefs.push(SpineItemRef {
idref: idref.clone(),
linear,
properties,
});
if let Some((href, _)) = self.manifest.get(&idref) {
self.spine_items.push((idref, href.clone()));
}
}
_ => {}
}
}
Ok(Event::Text(ref e)) => {
let text = e.unescape().unwrap_or_default().trim().to_string();
if !text.is_empty() && in_metadata {
match current_tag.as_str() {
"title" => self.title = text,
"creator" => self.author = text,
"language" => self.language = text,
"identifier" => self.identifier = text,
"date" => self.date = text,
"type" => self.dc_types.push(text),
"DictionaryInLanguage" => self.dict_in_language = text,
"DictionaryOutLanguage" => self.dict_out_language = text,
"DefaultLookupIndex" => self.default_lookup_index = text,
"EmbeddedCover" => self.embedded_cover_href = Some(text),
"rendition:layout" => {
if text == "pre-paginated" {
self.is_fixed_layout = true;
}
}
_ => {}
}
}
}
Ok(Event::End(ref e)) => {
let local_name = local_tag_name(e.name().as_ref());
match local_name.to_ascii_lowercase().as_str() {
"metadata" => in_metadata = false,
"manifest" => in_manifest = false,
"spine" => in_spine = false,
_ => {}
}
current_tag.clear();
}
Ok(Event::Eof) => break,
Err(_) => break,
_ => {}
}
buf.clear();
}
Ok(())
}
pub fn get_content_html_paths(&self) -> Vec<PathBuf> {
self.spine_items
.iter()
.filter_map(|(_, href)| {
let full_path = self.base_dir.join(href);
if full_path.exists() {
Some(full_path)
} else {
None
}
})
.collect()
}
pub fn get_image_items(&self) -> Vec<(String, String)> {
let mut items: Vec<(String, String, String)> = self
.manifest
.iter()
.filter(|(_, (_, media_type))| media_type.starts_with("image/"))
.map(|(id, (href, media_type))| (id.clone(), href.clone(), media_type.clone()))
.collect();
items.sort_by(|a, b| a.0.cmp(&b.0));
items.into_iter().map(|(_, href, mt)| (href, mt)).collect()
}
pub fn find_unreferenced_images(&self) -> Vec<(String, String)> {
use std::collections::BTreeSet;
use std::sync::OnceLock;
static IMG_SRC_RE: OnceLock<Regex> = OnceLock::new();
let img_src_re = IMG_SRC_RE.get_or_init(|| {
Regex::new(r#"(?i)<img\b[^>]*?\bsrc\s*=\s*"([^"]*)""#).unwrap()
});
let manifest_hrefs: HashMap<String, ()> = self
.manifest_items
.iter()
.map(|item| (item.href.clone(), ()))
.collect();
let mut found: BTreeSet<String> = BTreeSet::new();
for html_path in self.get_content_html_paths() {
let content = match std::fs::read_to_string(&html_path) {
Ok(c) => c,
Err(_) => continue,
};
let html_dir = html_path
.parent()
.unwrap_or_else(|| std::path::Path::new("."))
.to_path_buf();
for cap in img_src_re.captures_iter(&content) {
let src = cap.get(1).unwrap().as_str();
if src.is_empty() {
continue;
}
if src.contains("://") || src.starts_with("data:") || src.starts_with("#") {
continue;
}
let src_no_frag = src.split('#').next().unwrap_or(src);
let src_decoded = super_percent_decode(src_no_frag);
let candidate = html_dir.join(&src_decoded);
let canonical = candidate.canonicalize().unwrap_or(candidate);
if !canonical.is_file() {
continue;
}
let base_canonical = self
.base_dir
.canonicalize()
.unwrap_or_else(|_| self.base_dir.clone());
let rel = match canonical.strip_prefix(&base_canonical) {
Ok(p) => p,
Err(_) => continue,
};
let href = rel
.components()
.map(|c| c.as_os_str().to_string_lossy().into_owned())
.collect::<Vec<_>>()
.join("/");
if href.is_empty() || manifest_hrefs.contains_key(&href) {
continue;
}
found.insert(href);
}
}
found
.into_iter()
.filter_map(|href| {
guess_image_media_type(&href).map(|mt| (href, mt.to_string()))
})
.collect()
}
pub fn get_cover_image_id(&self) -> Option<String> {
if let Some(ref cover_id) = self.coverimage_id {
if let Some((_, media_type)) = self.manifest.get(cover_id) {
if media_type.starts_with("image/") {
return Some(cover_id.clone());
}
}
}
let opf_path = self.base_dir.join(
std::fs::read_dir(&self.base_dir)
.ok()?
.filter_map(|e| e.ok())
.find(|e| {
e.path()
.extension()
.map(|ext| ext == "opf")
.unwrap_or(false)
})?
.file_name(),
);
let content = std::fs::read_to_string(&opf_path).ok()?;
let cleaned = clean_opf_xml(&content);
let mut reader = Reader::from_str(&cleaned);
reader.config_mut().trim_text(true);
let mut buf = Vec::new();
loop {
match reader.read_event_into(&mut buf) {
Ok(Event::Start(ref e)) | Ok(Event::Empty(ref e)) => {
let local_name = local_tag_name(e.name().as_ref());
if local_name == "meta" {
let mut name_val = String::new();
let mut content_val = String::new();
for attr in e.attributes().flatten() {
match attr.key.as_ref() {
b"name" => {
name_val =
String::from_utf8_lossy(&attr.value).to_string()
}
b"content" => {
content_val =
String::from_utf8_lossy(&attr.value).to_string()
}
_ => {}
}
}
if name_val == "cover" && !content_val.is_empty() {
if let Some((_, media_type)) = self.manifest.get(&content_val) {
if media_type.starts_with("image/") {
return Some(content_val);
}
}
}
}
}
Ok(Event::Eof) => break,
Err(_) => break,
_ => {}
}
buf.clear();
}
if let Some(href) = &self.embedded_cover_href {
for item in &self.manifest_items {
if item.href == *href && item.media_type.starts_with("image/") {
return Some(item.id.clone());
}
}
}
None
}
pub fn get_cover_image_href(&self) -> Option<String> {
if let Some(ref cover_id) = self.coverimage_id {
if let Some((href, media_type)) = self.manifest.get(cover_id) {
if media_type.starts_with("image/") {
return Some(href.clone());
}
}
}
let opf_path = self.base_dir.join(
std::fs::read_dir(&self.base_dir)
.ok()?
.filter_map(|e| e.ok())
.find(|e| {
e.path()
.extension()
.map(|ext| ext == "opf")
.unwrap_or(false)
})?
.file_name(),
);
let content = std::fs::read_to_string(&opf_path).ok()?;
let cleaned = clean_opf_xml(&content);
let mut reader = Reader::from_str(&cleaned);
reader.config_mut().trim_text(true);
let mut buf = Vec::new();
loop {
match reader.read_event_into(&mut buf) {
Ok(Event::Start(ref e)) | Ok(Event::Empty(ref e)) => {
let local_name = local_tag_name(e.name().as_ref());
if local_name == "meta" {
let mut name_val = String::new();
let mut content_val = String::new();
for attr in e.attributes().flatten() {
match attr.key.as_ref() {
b"name" => {
name_val =
String::from_utf8_lossy(&attr.value).to_string()
}
b"content" => {
content_val =
String::from_utf8_lossy(&attr.value).to_string()
}
_ => {}
}
}
if name_val == "cover" && !content_val.is_empty() {
if let Some((href, media_type)) = self.manifest.get(&content_val) {
if media_type.starts_with("image/") {
return Some(href.clone());
}
}
}
}
}
Ok(Event::Eof) => break,
Err(_) => break,
_ => {}
}
buf.clear();
}
if let Some(href) = &self.embedded_cover_href {
for item in &self.manifest_items {
if item.href == *href && item.media_type.starts_with("image/") {
return Some(href.clone());
}
}
}
None
}
#[allow(dead_code)]
pub fn is_dictionary(&self) -> bool {
!self.dict_in_language.is_empty() || !self.dict_out_language.is_empty()
}
}
#[derive(Debug)]
#[allow(dead_code)]
pub struct DictionaryEntry {
pub headword: String,
pub inflections: Vec<String>,
pub html_content: String,
}
pub fn parse_dictionary_html(html_path: &Path) -> Result<Vec<DictionaryEntry>, std::io::Error> {
let content = std::fs::read_to_string(html_path)?;
let mut entries = Vec::new();
use std::sync::OnceLock;
static ORTH_VAL_RE: OnceLock<Regex> = OnceLock::new();
static IFORM_RE: OnceLock<Regex> = OnceLock::new();
let orth_val_re =
ORTH_VAL_RE.get_or_init(|| Regex::new(r#"<idx:orth\b[^>]*\svalue="([^"]*)""#).unwrap());
let iform_re = IFORM_RE.get_or_init(|| Regex::new(r#"<idx:iform\s+value="([^"]*)""#).unwrap());
let entry_open = "<idx:entry";
let entry_close = "</idx:entry>";
let mut search_pos = 0;
while let Some(start) = content[search_pos..].find(entry_open) {
let abs_start = search_pos + start;
let after_open = match content[abs_start..].find('>') {
Some(p) => abs_start + p + 1,
None => break,
};
let close_pos = match content[after_open..].find(entry_close) {
Some(p) => after_open + p,
None => break,
};
let entry_inner = &content[after_open..close_pos];
let full_entry = &content[abs_start..close_pos + entry_close.len()];
let headword = if let Some(cap) = orth_val_re.captures(entry_inner) {
unescape_html(cap.get(1).unwrap().as_str())
} else if let Some(hw) = extract_orth_body_text(entry_inner) {
hw
} else {
search_pos = close_pos + entry_close.len();
continue;
};
if headword.is_empty() {
search_pos = close_pos + entry_close.len();
continue;
}
let inflections: Vec<String> = iform_re
.captures_iter(entry_inner)
.map(|cap| unescape_html(cap.get(1).unwrap().as_str()))
.collect();
entries.push(DictionaryEntry {
headword,
inflections,
html_content: full_entry.to_string(),
});
search_pos = close_pos + entry_close.len();
}
Ok(entries)
}
fn extract_orth_body_text(entry_inner: &str) -> Option<String> {
let open = entry_inner.find("<idx:orth")?;
let after_tag_name = open + "<idx:orth".len();
let gt = entry_inner[after_tag_name..].find('>')?;
let tag_close = after_tag_name + gt;
if entry_inner.as_bytes().get(tag_close.saturating_sub(1)) == Some(&b'/') {
return None;
}
let body_start = tag_close + 1;
let close_rel = entry_inner[body_start..].find("</idx:orth>")?;
let body = &entry_inner[body_start..body_start + close_rel];
let mut cleaned = String::with_capacity(body.len());
let mut rest = body;
loop {
match rest.find("<idx:infl") {
Some(i) => {
cleaned.push_str(&rest[..i]);
match rest[i..].find("</idx:infl>") {
Some(j) => {
rest = &rest[i + j + "</idx:infl>".len()..];
}
None => break,
}
}
None => {
cleaned.push_str(rest);
break;
}
}
}
let mut out = String::with_capacity(cleaned.len());
let mut in_tag = false;
for ch in cleaned.chars() {
match ch {
'<' => in_tag = true,
'>' => in_tag = false,
c if !in_tag => out.push(c),
_ => {}
}
}
let unescaped = unescape_html(&out);
let trimmed = unescaped.split_whitespace().collect::<Vec<_>>().join(" ");
if trimmed.is_empty() {
None
} else {
Some(trimmed)
}
}
fn unescape_html(text: &str) -> String {
use std::sync::OnceLock;
static HEX_RE: OnceLock<Regex> = OnceLock::new();
static DEC_RE: OnceLock<Regex> = OnceLock::new();
if !text.contains('&') {
return text.to_string();
}
let mut result = text.to_string();
let hex_re = HEX_RE.get_or_init(|| Regex::new(r"&#x([0-9a-fA-F]+);").unwrap());
result = hex_re
.replace_all(&result, |caps: ®ex::Captures| {
let hex = caps.get(1).unwrap().as_str();
match u32::from_str_radix(hex, 16) {
Ok(cp) => char::from_u32(cp).unwrap_or('\u{FFFD}').to_string(),
Err(_) => caps.get(0).unwrap().as_str().to_string(),
}
})
.to_string();
let dec_re = DEC_RE.get_or_init(|| Regex::new(r"&#(\d+);").unwrap());
result = dec_re
.replace_all(&result, |caps: ®ex::Captures| {
let num = caps.get(1).unwrap().as_str();
match num.parse::<u32>() {
Ok(cp) => char::from_u32(cp).unwrap_or('\u{FFFD}').to_string(),
Err(_) => caps.get(0).unwrap().as_str().to_string(),
}
})
.to_string();
result = result.replace("&", "&");
result = result.replace("<", "<");
result = result.replace(">", ">");
result = result.replace(""", "\"");
result = result.replace("'", "'");
result = result.replace(" ", "\u{00A0}");
result
}
fn super_percent_decode(s: &str) -> String {
if !s.contains('%') {
return s.to_string();
}
let bytes = s.as_bytes();
let mut out: Vec<u8> = Vec::with_capacity(bytes.len());
let mut i = 0;
while i < bytes.len() {
if bytes[i] == b'%' && i + 2 < bytes.len() {
let hi = (bytes[i + 1] as char).to_digit(16);
let lo = (bytes[i + 2] as char).to_digit(16);
if let (Some(h), Some(l)) = (hi, lo) {
out.push(((h << 4) | l) as u8);
i += 3;
continue;
}
}
out.push(bytes[i]);
i += 1;
}
String::from_utf8(out).unwrap_or_else(|_| s.to_string())
}
fn guess_image_media_type(href: &str) -> Option<&'static str> {
let lower = href.to_ascii_lowercase();
let ext = lower.rsplit_once('.').map(|(_, e)| e).unwrap_or("");
match ext {
"jpg" | "jpeg" => Some("image/jpeg"),
"png" => Some("image/png"),
"gif" => Some("image/gif"),
"bmp" => Some("image/bmp"),
"svg" => Some("image/svg+xml"),
_ => None,
}
}
fn clean_opf_xml(content: &str) -> String {
let mut cleaned = content.to_string();
let xml_decl = Regex::new(r"<\?xml[^?]*\?>").unwrap();
cleaned = xml_decl.replace_all(&cleaned, "").to_string();
let opf_prefix = Regex::new(r"\bopf:").unwrap();
cleaned = opf_prefix.replace_all(&cleaned, "").to_string();
let dc_prefix = Regex::new(r"\bdc:").unwrap();
cleaned = dc_prefix.replace_all(&cleaned, "").to_string();
let xmlns_attr = Regex::new(r#"\s+xmlns:\w+="[^"]*""#).unwrap();
cleaned = xmlns_attr.replace_all(&cleaned, "").to_string();
cleaned
}
fn local_tag_name(name: &[u8]) -> String {
let s = std::str::from_utf8(name).unwrap_or("");
if let Some(pos) = s.rfind('}') {
s[pos + 1..].to_string()
} else if let Some(pos) = s.rfind(':') {
s[pos + 1..].to_string()
} else {
s.to_string()
}
}
#[cfg(test)]
mod tests {
use super::*;
use std::fs;
fn temp_dir(tag: &str) -> std::path::PathBuf {
let d = std::env::temp_dir().join(format!(
"kindling_opf_test_{}_{}",
tag,
std::process::id()
));
if d.exists() {
fs::remove_dir_all(&d).unwrap();
}
fs::create_dir_all(&d).unwrap();
d
}
#[test]
fn extract_orth_body_text_plain() {
let entry = r#"<idx:orth><b>hello</b></idx:orth>body"#;
assert_eq!(extract_orth_body_text(entry), Some("hello".to_string()));
}
#[test]
fn extract_orth_body_text_with_br_and_whitespace() {
let entry = "<idx:orth>\n<b>-eresse</b><br/>\n</idx:orth>";
assert_eq!(extract_orth_body_text(entry), Some("-eresse".to_string()));
}
#[test]
fn extract_orth_body_text_skips_idx_infl() {
let entry = r#"<idx:orth><b>grogner</b><idx:infl><idx:iform value="grognasses"/></idx:infl></idx:orth>"#;
assert_eq!(extract_orth_body_text(entry), Some("grogner".to_string()));
}
#[test]
fn extract_orth_body_text_self_closing_returns_none() {
let entry = r#"<idx:orth value="x"/>"#;
assert_eq!(extract_orth_body_text(entry), None);
}
#[test]
fn extract_orth_body_text_empty_body_returns_none() {
let entry = r#"<idx:orth></idx:orth>"#;
assert_eq!(extract_orth_body_text(entry), None);
}
#[test]
fn extract_orth_body_text_unescapes_entities() {
let entry = r#"<idx:orth><b>café</b></idx:orth>"#;
assert_eq!(extract_orth_body_text(entry), Some("café".to_string()));
}
#[test]
fn parse_dictionary_html_body_form_idx_orth() {
let dir = temp_dir("body_form");
let html = r#"<html><head></head><body><mbp:frameset>
<idx:entry scriptable="yes" spell="yes">
<idx:orth>
<b>alpha</b><br/>
</idx:orth>
first letter
</idx:entry>
<hr/>
<idx:entry>
<idx:orth><b>beta</b></idx:orth>
second letter
</idx:entry>
</mbp:frameset></body></html>"#;
let path = dir.join("c.html");
fs::write(&path, html).unwrap();
let entries = parse_dictionary_html(&path).unwrap();
assert_eq!(entries.len(), 2);
assert_eq!(entries[0].headword, "alpha");
assert_eq!(entries[1].headword, "beta");
fs::remove_dir_all(&dir).ok();
}
#[test]
fn parse_dictionary_html_body_form_with_inflections() {
let dir = temp_dir("body_form_infl");
let html = r#"<html><body>
<idx:entry>
<idx:orth><b>grogner</b>
<idx:infl>
<idx:iform value="grognes"/>
<idx:iform value="grognasses"/>
</idx:infl>
</idx:orth>
verb
</idx:entry>
</body></html>"#;
let path = dir.join("c.html");
fs::write(&path, html).unwrap();
let entries = parse_dictionary_html(&path).unwrap();
assert_eq!(entries.len(), 1);
assert_eq!(entries[0].headword, "grogner");
assert_eq!(entries[0].inflections, vec!["grognes", "grognasses"]);
fs::remove_dir_all(&dir).ok();
}
#[test]
fn parse_dictionary_html_attribute_form_still_works() {
let dir = temp_dir("attr_form");
let html = r#"<html><body>
<idx:entry><idx:orth value="cat">cat</idx:orth><b>cat</b> animal<hr/></idx:entry>
</body></html>"#;
let path = dir.join("c.html");
fs::write(&path, html).unwrap();
let entries = parse_dictionary_html(&path).unwrap();
assert_eq!(entries.len(), 1);
assert_eq!(entries[0].headword, "cat");
fs::remove_dir_all(&dir).ok();
}
#[test]
fn opf_parse_oeb1x_capitalized_dc_elements() {
let dir = temp_dir("oeb1x");
let opf = r#"<?xml version="1.0" encoding="utf-8"?>
<package unique-identifier="uid">
<metadata>
<dc-metadata xmlns:dc="http://purl.org/metadata/dublin_core">
<dc:Title>PyGlossary Dict</dc:Title>
<dc:Language>fr</dc:Language>
<dc:Identifier id="uid">abc123</dc:Identifier>
<dc:Creator>An Author</dc:Creator>
</dc-metadata>
<x-metadata>
<DictionaryInLanguage>fr</DictionaryInLanguage>
<DictionaryOutLanguage>fr</DictionaryOutLanguage>
<EmbeddedCover>cover.png</EmbeddedCover>
</x-metadata>
</metadata>
<manifest>
<item id="cover.png" href="cover.png" media-type="image/png"/>
<item id="c" href="c.xhtml" media-type="application/xhtml+xml"/>
</manifest>
<spine><itemref idref="c"/></spine>
</package>"#;
let opf_path = dir.join("content.opf");
fs::write(&opf_path, opf).unwrap();
let data = OPFData::parse(&opf_path).unwrap();
assert_eq!(data.title, "PyGlossary Dict");
assert_eq!(data.language, "fr");
assert_eq!(data.identifier, "abc123");
assert_eq!(data.author, "An Author");
assert_eq!(data.dict_in_language, "fr");
assert_eq!(data.dict_out_language, "fr");
assert_eq!(data.embedded_cover_href.as_deref(), Some("cover.png"));
assert_eq!(data.get_cover_image_href().as_deref(), Some("cover.png"));
assert_eq!(data.get_cover_image_id().as_deref(), Some("cover.png"));
fs::remove_dir_all(&dir).ok();
}
#[test]
fn opf_parse_modern_lowercase_dc_elements_unchanged() {
let dir = temp_dir("modern_dc");
let opf = r#"<?xml version="1.0"?>
<package unique-identifier="u" xmlns="http://www.idpf.org/2007/opf">
<metadata>
<dc:title xmlns:dc="http://purl.org/dc/elements/1.1/">Lower Dict</dc:title>
<dc:language xmlns:dc="http://purl.org/dc/elements/1.1/">en</dc:language>
<dc:identifier xmlns:dc="http://purl.org/dc/elements/1.1/" id="u">x</dc:identifier>
</metadata>
<manifest><item id="c" href="c.xhtml" media-type="application/xhtml+xml"/></manifest>
<spine><itemref idref="c"/></spine>
</package>"#;
let opf_path = dir.join("content.opf");
fs::write(&opf_path, opf).unwrap();
let data = OPFData::parse(&opf_path).unwrap();
assert_eq!(data.title, "Lower Dict");
assert_eq!(data.language, "en");
assert_eq!(data.identifier, "x");
assert_eq!(data.embedded_cover_href, None);
fs::remove_dir_all(&dir).ok();
}
#[test]
fn embedded_cover_ignored_if_not_in_manifest() {
let dir = temp_dir("embedded_cover_nomanifest");
let opf = r#"<?xml version="1.0"?>
<package unique-identifier="u">
<metadata>
<x-metadata><EmbeddedCover>missing.png</EmbeddedCover></x-metadata>
</metadata>
<manifest><item id="c" href="c.xhtml" media-type="application/xhtml+xml"/></manifest>
<spine><itemref idref="c"/></spine>
</package>"#;
let opf_path = dir.join("content.opf");
fs::write(&opf_path, opf).unwrap();
let data = OPFData::parse(&opf_path).unwrap();
assert_eq!(data.embedded_cover_href.as_deref(), Some("missing.png"));
assert_eq!(data.get_cover_image_href(), None);
fs::remove_dir_all(&dir).ok();
}
#[test]
fn find_unreferenced_images_picks_up_undeclared_gif() {
let dir = temp_dir("undeclared_img");
let opf = r#"<?xml version="1.0" encoding="utf-8"?>
<package unique-identifier="u">
<metadata>
<dc-metadata xmlns:dc="http://purl.org/metadata/dublin_core">
<dc:Title>T</dc:Title><dc:Language>fr</dc:Language><dc:Identifier id="u">x</dc:Identifier>
</dc-metadata>
<x-metadata><DictionaryInLanguage>fr</DictionaryInLanguage></x-metadata>
</metadata>
<manifest><item id="c" href="c.xhtml" media-type="application/xhtml+xml"/></manifest>
<spine><itemref idref="c"/></spine>
</package>"#;
let opf_path = dir.join("content.opf");
fs::write(&opf_path, opf).unwrap();
let html = r#"<html><body><mbp:frameset>
<idx:entry><idx:orth value="djed"/><b>djed</b>
<img src="./glyph.gif" alt="g"/>
</idx:entry></mbp:frameset></body></html>"#;
fs::write(dir.join("c.xhtml"), html).unwrap();
fs::write(dir.join("glyph.gif"), b"GIF89aSTUB").unwrap();
let data = OPFData::parse(&opf_path).unwrap();
let extras = data.find_unreferenced_images();
assert_eq!(extras.len(), 1);
assert_eq!(extras[0].0, "glyph.gif");
assert_eq!(extras[0].1, "image/gif");
fs::remove_dir_all(&dir).ok();
}
#[test]
fn find_unreferenced_images_skips_declared_and_missing_files() {
let dir = temp_dir("declared_and_missing");
let opf = r#"<?xml version="1.0" encoding="utf-8"?>
<package unique-identifier="u">
<metadata>
<dc-metadata xmlns:dc="http://purl.org/metadata/dublin_core">
<dc:Title>T</dc:Title><dc:Language>en</dc:Language><dc:Identifier id="u">x</dc:Identifier>
</dc-metadata>
</metadata>
<manifest>
<item id="c" href="c.xhtml" media-type="application/xhtml+xml"/>
<item id="cov" href="cover.png" media-type="image/png"/>
</manifest>
<spine><itemref idref="c"/></spine>
</package>"#;
let opf_path = dir.join("content.opf");
fs::write(&opf_path, opf).unwrap();
let html = r#"<html><body>
<img src="cover.png"/>
<img src="does-not-exist.png"/>
<img src="https://example.com/remote.png"/>
</body></html>"#;
fs::write(dir.join("c.xhtml"), html).unwrap();
fs::write(dir.join("cover.png"), b"\x89PNG\r\n\x1a\n").unwrap();
let data = OPFData::parse(&opf_path).unwrap();
let extras = data.find_unreferenced_images();
assert!(extras.is_empty(), "expected no extras, got {:?}", extras);
fs::remove_dir_all(&dir).ok();
}
#[test]
fn find_unreferenced_images_handles_percent_encoded_src() {
let dir = temp_dir("percent_encoded_img");
let opf = r#"<?xml version="1.0" encoding="utf-8"?>
<package unique-identifier="u">
<metadata>
<dc-metadata xmlns:dc="http://purl.org/metadata/dublin_core">
<dc:Title>T</dc:Title><dc:Language>en</dc:Language><dc:Identifier id="u">x</dc:Identifier>
</dc-metadata>
</metadata>
<manifest><item id="c" href="c.xhtml" media-type="application/xhtml+xml"/></manifest>
<spine><itemref idref="c"/></spine>
</package>"#;
let opf_path = dir.join("content.opf");
fs::write(&opf_path, opf).unwrap();
let html = r#"<html><body><img src="my%20glyph.png"/></body></html>"#;
fs::write(dir.join("c.xhtml"), html).unwrap();
fs::write(dir.join("my glyph.png"), b"\x89PNG\r\n\x1a\n").unwrap();
let data = OPFData::parse(&opf_path).unwrap();
let extras = data.find_unreferenced_images();
assert_eq!(extras.len(), 1);
assert_eq!(extras[0].0, "my glyph.png");
fs::remove_dir_all(&dir).ok();
}
}