//! Lazily-cached access to an extracted EPUB's files, plus lightweight CSS
//! text scanning used to build a [`CssSummary`] per stylesheet.

use std::cell::{Ref, RefCell};
use std::collections::{HashMap, HashSet};
use std::fs;
use std::path::{Path, PathBuf};

use lightningcss::stylesheet::{ParserOptions, StyleSheet};

use crate::epub;
use crate::opf::OPFData;
use crate::profile::Profile;
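
/// Line-indexed findings scanned out of one stylesheet: parse status, `@import`
/// targets, `url(...)` references, `@font-face` blocks, `@namespace` and
/// flagged media-feature locations, declared property names, and any forbidden
/// `position` values. All line numbers are 1-based.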
#[derive(Clone, Debug)]
pub struct CssSummary {
    pub parse_error: Option<String>,
    pub imports: Vec<(usize, String)>,
    pub url_refs: Vec<(usize, String)>,
    pub font_faces: Vec<CssFontFace>,
    pub namespace_lines: Vec<usize>,
    pub media_features: Vec<(usize, String)>,
    pub property_names: HashSet<String>,
    pub forbidden_positions: Vec<(usize, String)>,
}
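
/// A single `@font-face` block: the line it starts on, any `url(...)`
/// references inside its body, and whether the block lacks a `src:` descriptor.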
#[derive(Clone, Debug)]
pub struct CssFontFace {
    pub line: usize,
    pub src_urls: Vec<(usize, String)>,
    pub missing_src: bool,
}
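
/// An EPUB laid out on disk, with paths resolved against the OPF's base
/// directory. File contents, parsed HTML, CSS text, CSS summaries, and element
/// ids are cached lazily behind `RefCell`s so the accessors can take `&self`.
///
/// A sketch of typical use (illustrative only, not compiled as a doctest; the
/// path is a placeholder):
///
/// ```ignore
/// let epub = ExtractedEpub::from_epub_path(Path::new("book.epub"))?;
/// for href in epub.manifest_hrefs() {
///     if href.ends_with(".css") {
///         if let Some(summary) = epub.css_summary(&href) {
///             println!("{}: {} @import rule(s)", href, summary.imports.len());
///         }
///     }
/// }
/// ```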
pub struct ExtractedEpub {
    pub root: PathBuf,
    pub opf_path: PathBuf,
    pub opf: OPFData,
    pub profile: Profile,
    _temp_dir: Option<PathBuf>,
    raw_cache: RefCell<HashMap<String, Option<String>>>,
    html_cache: RefCell<HashMap<String, scraper::Html>>,
    css_text_cache: RefCell<HashMap<String, String>>,
    css_summary_cache: RefCell<HashMap<String, CssSummary>>,
    ids_cache: RefCell<HashMap<String, HashSet<String>>>,
}
impl ExtractedEpub {
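    /// Wraps an already-extracted EPUB given the path to its OPF file. No
    /// temporary directory is owned, so nothing is removed on drop.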
    pub fn from_opf_path(opf_path: &Path) -> Result<Self, Box<dyn std::error::Error>> {
        let opf = OPFData::parse(opf_path)?;
        let root = opf.base_dir.clone();
        let profile = Profile::autodetect(&opf, &root);
        Ok(Self {
            root,
            opf_path: opf_path.to_path_buf(),
            opf,
            profile,
            _temp_dir: None,
            raw_cache: RefCell::new(HashMap::new()),
            html_cache: RefCell::new(HashMap::new()),
            css_text_cache: RefCell::new(HashMap::new()),
            css_summary_cache: RefCell::new(HashMap::new()),
            ids_cache: RefCell::new(HashMap::new()),
        })
    }
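
    /// Extracts the `.epub` archive to a temporary directory and builds the
    /// wrapper on top of it; the directory is removed when the value is dropped.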
    #[allow(dead_code)]
    pub fn from_epub_path(epub_path: &Path) -> Result<Self, Box<dyn std::error::Error>> {
        let (temp_dir, opf_path) = epub::extract_epub(epub_path)?;
        let mut me = Self::from_opf_path(&opf_path)?;
        me._temp_dir = Some(temp_dir);
        Ok(me)
    }
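
    /// Reads a file (href relative to the EPUB root) as UTF-8, caching the
    /// result; failed reads are cached as `None`.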
    pub fn read(&self, href: &str) -> Option<String> {
        if let Some(cached) = self.raw_cache.borrow().get(href) {
            return cached.clone();
        }
        let path = self.root.join(href);
        let content = fs::read_to_string(&path).ok();
        self.raw_cache.borrow_mut().insert(href.to_string(), content.clone());
        content
    }
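
    /// Returns the `scraper` DOM for an (X)HTML file, parsing and caching it on
    /// first access.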
    #[allow(dead_code)]
    pub fn html(&self, href: &str) -> Option<Ref<'_, scraper::Html>> {
        if !self.html_cache.borrow().contains_key(href) {
            let text = self.read(href)?;
            let doc = scraper::Html::parse_document(&text);
            self.html_cache.borrow_mut().insert(href.to_string(), doc);
        }
        Some(Ref::map(self.html_cache.borrow(), |m| m.get(href).unwrap()))
    }
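
    /// Returns the text of a stylesheet, reading and caching it on first access.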
    #[allow(dead_code)]
    pub fn css(&self, href: &str) -> Option<Ref<'_, String>> {
        if !self.css_text_cache.borrow().contains_key(href) {
            let text = self.read(href)?;
            self.css_text_cache.borrow_mut().insert(href.to_string(), text);
        }
        Some(Ref::map(self.css_text_cache.borrow(), |m| m.get(href).unwrap()))
    }
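
    /// Returns the cached [`CssSummary`] for a stylesheet, building it on first
    /// access from the cached CSS text.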
    #[allow(dead_code)]
    pub fn css_summary(&self, href: &str) -> Option<Ref<'_, CssSummary>> {
        if !self.css_summary_cache.borrow().contains_key(href) {
            // Reuse the CSS text cache via `css()`; cloning through the returned
            // `Ref` yields an owned `String`, so the borrow ends with this statement.
            let text = self.css(href)?.clone();
            let summary = build_css_summary(&text);
            self.css_summary_cache
                .borrow_mut()
                .insert(href.to_string(), summary);
        }
        Some(Ref::map(self.css_summary_cache.borrow(), |m| {
            m.get(href).unwrap()
        }))
    }
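
    /// Returns the set of `id` attribute values found in a document, collected
    /// by the text heuristic in `collect_ids`.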
    #[allow(dead_code)]
    pub fn ids(&self, href: &str) -> Option<Ref<'_, HashSet<String>>> {
        if !self.ids_cache.borrow().contains_key(href) {
            let text = self.read(href)?;
            let set = collect_ids(&text);
            self.ids_cache.borrow_mut().insert(href.to_string(), set);
        }
        Some(Ref::map(self.ids_cache.borrow(), |m| m.get(href).unwrap()))
    }
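
    /// All hrefs listed in the OPF manifest, with any `#fragment` stripped.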
    #[allow(dead_code)]
    pub fn manifest_hrefs(&self) -> HashSet<String> {
        self.opf
            .manifest
            .values()
            .map(|(href, _)| match href.find('#') {
                Some(i) => href[..i].to_string(),
                None => href.clone(),
            })
            .collect()
    }
}
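
// Removes the temporary extraction directory, if this instance owns one.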
impl Drop for ExtractedEpub {
    fn drop(&mut self) {
        if let Some(ref dir) = self._temp_dir {
            if dir.exists() {
                let _ = fs::remove_dir_all(dir);
            }
        }
    }
}
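
/// Builds a [`CssSummary`] for one stylesheet: the prologue-stripped text is
/// parsed once with lightningcss to capture any parse error, and the remaining
/// fields come from the plain-text scanners below.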
fn build_css_summary(raw_text: &str) -> CssSummary {
    let text = strip_css_prologue(raw_text);
    let parse_error = match StyleSheet::parse(text, ParserOptions::default()) {
        Ok(_) => None,
        Err(e) => Some(format!("{}", e)),
    };
    let imports = find_imports(text);
    let url_refs = find_url_refs(text);
    let font_faces = find_font_faces(text);
    let namespace_lines = find_namespace_lines(text);
    let media_features = find_media_features(text);
    let property_names = find_property_names(text);
    let forbidden_positions = find_forbidden_positions(text);
    CssSummary {
        parse_error,
        imports,
        url_refs,
        font_faces,
        namespace_lines,
        media_features,
        property_names,
        forbidden_positions,
    }
}
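
/// Strips a leading UTF-8 BOM and, if present, an `@charset "...";` prologue.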
fn strip_css_prologue(text: &str) -> &str {
    let t = text.strip_prefix('\u{FEFF}').unwrap_or(text);
    let trimmed = t.trim_start();
    let offset = t.len() - trimmed.len();
    if !trimmed.starts_with("@charset") {
        return t;
    }
    if let Some(end) = trimmed.find(';') {
        return &t[offset + end + 1..];
    }
    t
}
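
/// Finds `@import` rules and returns `(line, target)` for each target that can
/// be extracted from either the `url(...)` or the quoted-string form.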
fn find_imports(text: &str) -> Vec<(usize, String)> {
    let mut out = Vec::new();
    let lower = text.to_ascii_lowercase();
    for (abs, _) in iter_ascii_matches(&lower, "@import") {
        let after = abs + "@import".len();
        let rest_raw = &text[after..];
        let rest = rest_raw.trim_start();
        // `lower` shares byte offsets with `text`, so detect the target form
        // case-insensitively there while extracting the target from `text` to
        // preserve the URL's original case.
        let rest_start = after + (rest_raw.len() - rest.len());
        let target = if lower[rest_start..].starts_with("url(") {
            parse_url_body(&rest[4..])
        } else if rest.starts_with('"') || rest.starts_with('\'') {
            let quote = rest.chars().next().unwrap();
            let after_q = &rest[1..];
            after_q.find(quote).map(|end| after_q[..end].to_string())
        } else {
            None
        };
        if let Some(t) = target {
            out.push((line_of(text, abs), t));
        }
    }
    out
}
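
/// Finds `url(...)` references, skipping those belonging to `@import` or
/// `@namespace` rules as well as empty and `data:` URLs.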
fn find_url_refs(text: &str) -> Vec<(usize, String)> {
    let mut out = Vec::new();
    let lower = text.to_ascii_lowercase();
    let mut pos = 0usize;
    while let Some(idx) = lower[pos..].find("url(") {
        let abs = pos + idx;
        let before = lower[..abs].trim_end();
        if before.ends_with("@import") || belongs_to_namespace(before) {
            pos = abs + 4;
            continue;
        }
        let body_start = abs + 4;
        if let Some(target) = parse_url_body(&text[body_start..]) {
            if !target.is_empty() && !target.starts_with("data:") {
                out.push((line_of(text, abs), target));
            }
        }
        pos = body_start;
    }
    out
}
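
/// Returns true if the text ending at `before` is the prelude of an
/// `@namespace` rule, optionally with a namespace prefix before the `url(`.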
fn belongs_to_namespace(before: &str) -> bool {
    if before.ends_with("@namespace") {
        return true;
    }
    let after_ident = before.trim_end_matches(|c: char| {
        c.is_ascii_alphanumeric() || c == '-' || c == '_'
    });
    if after_ident.len() == before.len() {
        return false;
    }
    let after_space = after_ident.trim_end();
    after_space.ends_with("@namespace")
}
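
/// Finds `@font-face` blocks by balancing braces, recording each block's
/// starting line, the `url(...)` references inside it, and whether a `src:`
/// descriptor is missing.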
fn find_font_faces(text: &str) -> Vec<CssFontFace> {
    let mut out = Vec::new();
    let lower = text.to_ascii_lowercase();
    let mut pos = 0usize;
    while let Some(idx) = lower[pos..].find("@font-face") {
        let abs = pos + idx;
        let after = abs + "@font-face".len();
        let Some(open_rel) = text[after..].find('{') else { break };
        let open = after + open_rel;
        // Balance nested braces to find the end of the block.
        let mut depth = 0i32;
        let mut close = None;
        for (i, c) in text[open..].char_indices() {
            match c {
                '{' => depth += 1,
                '}' => {
                    depth -= 1;
                    if depth == 0 {
                        close = Some(open + i);
                        break;
                    }
                }
                _ => {}
            }
        }
        let Some(close) = close else { break };
        let block = &text[open + 1..close];
        let block_lower = block.to_ascii_lowercase();
        let missing_src = !block_lower.contains("src:");
        // `find_url_refs` reports lines relative to the block, so shift them to
        // lines within the full stylesheet.
        let block_urls: Vec<(usize, String)> = find_url_refs(block)
            .into_iter()
            .map(|(rel_line, url)| {
                let abs_line = line_of(text, open + 1) + rel_line - 1;
                (abs_line, url)
            })
            .collect();
        out.push(CssFontFace {
            line: line_of(text, abs),
            src_urls: block_urls,
            missing_src,
        });
        pos = close + 1;
    }
    out
}
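
/// Records the line of every `@namespace` rule, ignoring matches that are the
/// tail of a longer identifier.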
fn find_namespace_lines(text: &str) -> Vec<usize> {
    let mut out = Vec::new();
    let lower = text.to_ascii_lowercase();
    for (abs, _) in iter_ascii_matches(&lower, "@namespace") {
        if let Some(prev) = lower[..abs].chars().last() {
            if prev.is_ascii_alphanumeric() || prev == '-' || prev == '_' {
                continue;
            }
        }
        out.push(line_of(text, abs));
    }
    out
}
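
/// Media features flagged by `find_media_features` when they appear in an
/// `@media` prelude.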
const UNSUPPORTED_MEDIA_FEATURES: &[&str] = &[
    "hover",
    "any-hover",
    "pointer",
    "any-pointer",
    "color-gamut",
    "prefers-color-scheme",
];
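
/// Finds occurrences of `UNSUPPORTED_MEDIA_FEATURES` entries inside `@media`
/// preludes, using rough word-boundary checks on either side of the match.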
fn find_media_features(text: &str) -> Vec<(usize, String)> {
    let mut out = Vec::new();
    let lower = text.to_ascii_lowercase();
    let mut pos = 0usize;
    while let Some(idx) = lower[pos..].find("@media") {
        let abs = pos + idx;
        // Skip matches that are the tail of a longer identifier.
        if let Some(prev) = lower[..abs].chars().last() {
            if prev.is_ascii_alphanumeric() || prev == '-' || prev == '_' {
                pos = abs + "@media".len();
                continue;
            }
        }
        let after = abs + "@media".len();
        let Some(brace_rel) = text[after..].find('{') else { break };
        let prelude = &lower[after..after + brace_rel];
        let prelude_orig_start = after;
        for feat in UNSUPPORTED_MEDIA_FEATURES {
            if let Some(f_idx) = prelude.find(feat) {
                // Require rough word boundaries so e.g. "hover" does not match
                // inside "any-hover".
                let before_ok = if f_idx == 0 {
                    true
                } else {
                    let prev = prelude.as_bytes()[f_idx - 1];
                    !prev.is_ascii_alphanumeric() && prev != b'-' && prev != b'_'
                };
                let end = f_idx + feat.len();
                let after_ok = if end == prelude.len() {
                    true
                } else {
                    let next = prelude.as_bytes()[end];
                    !next.is_ascii_alphanumeric() && next != b'_'
                };
                if before_ok && after_ok {
                    out.push((
                        line_of(text, prelude_orig_start + f_idx),
                        (*feat).to_string(),
                    ));
                }
            }
        }
        pos = after + brace_rel + 1;
    }
    out
}
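
/// Collects the lowercased identifiers that appear immediately before a `:`
/// outside `/* ... */` comments. Custom properties (`--*`) are skipped; as a
/// coarse heuristic this can also pick up names preceding selector
/// pseudo-class colons.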
fn find_property_names(text: &str) -> HashSet<String> {
    let mut out = HashSet::new();
    let lower = text.to_ascii_lowercase();
    let bytes = lower.as_bytes();
    let len = bytes.len();
    let mut i = 0usize;
    while i < len {
        // Skip /* ... */ comments entirely.
        if i + 1 < len && bytes[i] == b'/' && bytes[i + 1] == b'*' {
            if let Some(end) = lower[i + 2..].find("*/") {
                i = i + 2 + end + 2;
            } else {
                break;
            }
            continue;
        }
        if bytes[i] == b':' {
            // Walk backwards over whitespace, then over the identifier that
            // precedes the colon.
            let colon = i;
            let mut j = colon;
            while j > 0 && bytes[j - 1].is_ascii_whitespace() {
                j -= 1;
            }
            let name_end = j;
            while j > 0
                && (bytes[j - 1].is_ascii_alphanumeric()
                    || bytes[j - 1] == b'-'
                    || bytes[j - 1] == b'_')
            {
                j -= 1;
            }
            if j < name_end {
                let name = &lower[j..name_end];
                let boundary_ok = j == 0
                    || (!bytes[j - 1].is_ascii_alphanumeric() && bytes[j - 1] != b'_');
                if boundary_ok && !name.starts_with("--") {
                    out.insert(name.to_string());
                }
            }
        }
        i += 1;
    }
    out
}
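
/// `position` values reported by `find_forbidden_positions`.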
const FORBIDDEN_POSITION_VALUES: &[&str] = &["fixed", "absolute", "sticky"];
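
/// Finds `position: fixed|absolute|sticky` declarations, ignoring longer
/// property names such as `background-position`.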
fn find_forbidden_positions(text: &str) -> Vec<(usize, String)> {
    let mut out = Vec::new();
    let lower = text.to_ascii_lowercase();
    for (abs, matched) in iter_ascii_matches(&lower, "position") {
        let after = abs + matched.len();
        let rest = &lower[after..];
        let rest_trim = rest.trim_start();
        if !rest_trim.starts_with(':') {
            continue;
        }
        if let Some(prev) = lower[..abs].chars().last() {
            if prev.is_ascii_alphanumeric() || prev == '-' || prev == '_' {
                continue;
            }
        }
        let value_area = &rest_trim[1..];
        let value = value_area
            .split(|c: char| c == ';' || c == '}' || c == '!')
            .next()
            .unwrap_or("")
            .trim();
        for forbidden in FORBIDDEN_POSITION_VALUES {
            if value.eq_ignore_ascii_case(forbidden) {
                out.push((line_of(text, abs), (*forbidden).to_string()));
                break;
            }
        }
    }
    out
}
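
/// Parses the body of a `url(...)` token starting just after the opening
/// parenthesis, handling both quoted and unquoted forms.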
fn parse_url_body(body: &str) -> Option<String> {
    let trimmed = body.trim_start();
    let rest: &str;
    let end_char: char;
    if let Some(stripped) = trimmed.strip_prefix('"') {
        rest = stripped;
        end_char = '"';
    } else if let Some(stripped) = trimmed.strip_prefix('\'') {
        rest = stripped;
        end_char = '\'';
    } else {
        let end = trimmed
            .find(|c: char| c == ')' || c.is_whitespace())
            .unwrap_or(trimmed.len());
        return Some(trimmed[..end].to_string());
    }
    let end = rest.find(end_char)?;
    Some(rest[..end].to_string())
}
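
/// 1-based line number of `byte_offset` within `content`.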
fn line_of(content: &str, byte_offset: usize) -> usize {
    content[..byte_offset.min(content.len())]
        .bytes()
        .filter(|b| *b == b'\n')
        .count()
        + 1
}
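
/// Iterates over the byte offsets of each non-overlapping occurrence of
/// `needle` in `haystack`.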
fn iter_ascii_matches<'a>(
    haystack: &'a str,
    needle: &'a str,
) -> impl Iterator<Item = (usize, &'a str)> + 'a {
    let mut pos = 0usize;
    std::iter::from_fn(move || {
        if pos >= haystack.len() {
            return None;
        }
        let idx = haystack[pos..].find(needle)?;
        let abs = pos + idx;
        pos = abs + needle.len();
        Some((abs, needle))
    })
}
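
/// Collects `id` attribute values with a plain-text scan; only the
/// double-quoted ` id="..."` form is matched.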
fn collect_ids(html: &str) -> HashSet<String> {
    let mut out = HashSet::new();
    let mut rest = html;
    while let Some(idx) = rest.find(" id=\"") {
        rest = &rest[idx + 5..];
        let Some(end) = rest.find('"') else { break };
        out.insert(rest[..end].to_string());
        rest = &rest[end..];
    }
    out
}