use std::{
    cmp,
    collections::HashMap,
    io::{Read, Seek},
    path::PathBuf,
};

#[cfg(feature = "builder")]
use chrono::Local;
use quick_xml::{
    NsReader,
    events::{BytesStart, Event},
};
use sha1::{Digest, Sha1};
use zip::{CompressionMethod, ZipArchive};

use crate::error::EpubError;
/// Element local names that belong to the Dublin Core (`dc:`) namespace,
/// kept in alphabetical order.
#[cfg(feature = "builder")]
pub static ELEMENT_IN_DC_NAMESPACE: std::sync::LazyLock<Vec<&str>> =
    std::sync::LazyLock::new(|| {
        [
            "contributor",
            "coverage",
            "creator",
            "date",
            "description",
            "format",
            "identifier",
            "language",
            "publisher",
            "relation",
            "rights",
            "source",
            "subject",
            "title",
            "type",
        ]
        .to_vec()
    });
/// Returns the current local time rendered with the pattern
/// `%Y-%m-%dT%H-%M-%S.%fU%z` (date, hyphen-separated time, fractional
/// seconds, literal `U`, then the UTC offset).
///
/// NOTE(review): the time-of-day separators are hyphens (filename-safe, not
/// ISO 8601) and the literal `U` before the offset is unusual — presumably
/// intentional for unique file/entry names; confirm against callers before
/// changing.
#[cfg(feature = "builder")]
pub fn local_time() -> String {
Local::now().format("%Y-%m-%dT%H-%M-%S.%fU%z").to_string()
}
pub fn get_file_in_zip_archive<R: Read + Seek>(
zip_file: &mut ZipArchive<R>,
file_name: &str,
) -> Result<Vec<u8>, EpubError> {
let mut buffer = Vec::<u8>::new();
match zip_file.by_name(file_name) {
Ok(mut file) => {
let _ = file.read_to_end(&mut buffer).map_err(EpubError::from)?;
Ok(buffer)
}
Err(err) => Err(EpubError::from(err)),
}
}
/// Verifies that every entry in the archive uses a compression method the
/// EPUB spec allows (`Stored` or `Deflated`).
///
/// # Errors
/// Returns `EpubError::UnusableCompressionMethod` naming the first offending
/// entry, or propagates archive access errors.
pub fn compression_method_check<R: Read + Seek>(
    zip_archive: &mut ZipArchive<R>,
) -> Result<(), EpubError> {
    for idx in 0..zip_archive.len() {
        let entry = zip_archive.by_index(idx)?;
        let method = entry.compression();
        let supported = matches!(
            method,
            CompressionMethod::Stored | CompressionMethod::Deflated
        );
        if !supported {
            return Err(EpubError::UnusableCompressionMethod {
                file: entry.name().to_string(),
                method: method.to_string(),
            });
        }
    }
    Ok(())
}
/// Resolves a relative link (possibly starting with `../` segments) against
/// `current_dir` inside the extracted EPUB tree rooted at `epub_path`.
///
/// Returns the resolved path relative to `epub_path`, or `None` when the
/// `../` segments would escape `epub_path` (link leakage).
///
/// NOTE(review): only the text after the last `../` is kept, so mixed forms
/// like `a/../b` lose their leading segments — presumably links always start
/// with their full `../` run; confirm against callers.
pub fn check_realtive_link_leakage(
    epub_path: PathBuf,
    current_dir: PathBuf,
    check_file: &str,
) -> Option<String> {
    let segments: Vec<&str> = check_file.split("../").collect();
    let ascend_count = segments.len() - 1;
    let tail = segments.last().copied().unwrap_or("");

    // Walk up one directory per `../`; bail out if we run out of components.
    let mut resolved = epub_path.join(current_dir);
    for _ in 0..ascend_count {
        if !resolved.pop() {
            return None;
        }
    }

    // Landing outside `epub_path` entirely is also leakage.
    let relative = resolved.strip_prefix(&epub_path).ok()?.to_str().unwrap();
    if relative.is_empty() {
        Some(tail.to_string())
    } else {
        Some(format!("{}/{}", relative, tail))
    }
}
/// Returns `path` with a single leading `/` removed, or an owned copy of the
/// path unchanged when it has no leading slash.
#[cfg(feature = "builder")]
pub fn remove_leading_slash<P: AsRef<std::path::Path>>(path: P) -> PathBuf {
    let p = path.as_ref();
    match p.strip_prefix("/") {
        Ok(stripped) => stripped.to_path_buf(),
        Err(_) => p.to_path_buf(),
    }
}
/// Obfuscates font data per the IDPF scheme: the first 1040 bytes are XORed
/// with the repeating SHA-1 digest of `key`; the remainder is copied as-is.
///
/// Returns an empty vector for empty input.
pub fn idpf_font_encryption(data: &[u8], key: &str) -> Vec<u8> {
    if data.is_empty() {
        return Vec::new();
    }
    // The SHA-1 digest of the key is the XOR pad.
    let mut hasher = Sha1::new();
    hasher.update(key.as_bytes());
    let digest = hasher.finalize();

    let mut output = data.to_vec();
    let span = cmp::min(1040, output.len());
    for (i, byte) in output[..span].iter_mut().enumerate() {
        *byte ^= digest[i % digest.len()];
    }
    output
}
/// Deobfuscates IDPF-obfuscated font data.
///
/// The XOR-based obfuscation is symmetric, so decryption simply re-applies
/// `idpf_font_encryption` with the same key. (The public name keeps the
/// existing spelling "dencryption".)
pub fn idpf_font_dencryption(data: &[u8], key: &str) -> Vec<u8> {
idpf_font_encryption(data, key)
}
/// Obfuscates font data per the Adobe scheme: the first 1024 bytes are XORed
/// with the first 16 bytes of `key`, repeated; the remainder is copied as-is.
///
/// Returns an empty vector for empty input.
///
/// NOTE(review): `key.as_bytes()[i % 16]` panics when `key` is shorter than
/// 16 bytes — presumably callers always pass a 16-byte-or-longer key; verify.
pub fn adobe_font_encryption(data: &[u8], key: &str) -> Vec<u8> {
    if data.is_empty() {
        return Vec::new();
    }
    let pad = key.as_bytes();
    let mut output = data.to_vec();
    let span = cmp::min(1024, output.len());
    for (i, byte) in output[..span].iter_mut().enumerate() {
        *byte ^= pad[i % 16];
    }
    output
}
/// Deobfuscates Adobe-obfuscated font data.
///
/// The XOR-based obfuscation is symmetric, so decryption simply re-applies
/// `adobe_font_encryption` with the same key. (The public name keeps the
/// existing spelling "dencryption".)
pub fn adobe_font_dencryption(data: &[u8], key: &str) -> Vec<u8> {
adobe_font_encryption(data, key)
}
/// Decodes raw bytes into text, honoring UTF-8/UTF-16 byte-order marks.
pub trait DecodeBytes {
/// Decodes `self` into a `String`; inputs shorter than 4 bytes are
/// rejected with `EpubError::EmptyDataError`.
fn decode(&self) -> Result<String, EpubError>;
}
impl DecodeBytes for Vec<u8> {
    /// Decodes bytes to a `String`, trying in order: BOM-tagged UTF-8,
    /// BOM-tagged UTF-16 (BE/LE), plain UTF-8, heuristic BOM-less UTF-16,
    /// and finally lossy UTF-8.
    ///
    /// # Errors
    /// `EpubError::EmptyDataError` for inputs shorter than 4 bytes; UTF-8 /
    /// UTF-16 conversion errors for invalid BOM-tagged data.
    fn decode(&self) -> Result<String, EpubError> {
        // `len() < 4` already covers the empty case, so the previous extra
        // `is_empty()` check was redundant.
        if self.len() < 4 {
            return Err(EpubError::EmptyDataError);
        }
        match self.as_slice() {
            // UTF-8 BOM: decode the remainder strictly.
            [0xEF, 0xBB, 0xBF, rest @ ..] => {
                String::from_utf8(rest.to_vec()).map_err(EpubError::from)
            }
            // UTF-16 BE BOM.
            // NOTE(review): `chunks_exact(2)` silently drops an odd trailing
            // byte — confirm that is acceptable for malformed input.
            [0xFE, 0xFF, rest @ ..] => {
                let units: Vec<u16> = rest
                    .chunks_exact(2)
                    .map(|pair| u16::from_be_bytes([pair[0], pair[1]]))
                    .collect();
                String::from_utf16(&units).map_err(EpubError::from)
            }
            // UTF-16 LE BOM.
            [0xFF, 0xFE, rest @ ..] => {
                let units: Vec<u16> = rest
                    .chunks_exact(2)
                    .map(|pair| u16::from_le_bytes([pair[0], pair[1]]))
                    .collect();
                String::from_utf16(&units).map_err(EpubError::from)
            }
            // No BOM: accept clean UTF-8, then guess UTF-16 (BE before LE).
            _ => {
                let lossy = String::from_utf8_lossy(self);
                if !lossy.contains('\u{FFFD}') {
                    return Ok(lossy.into_owned());
                }
                if self.len() % 2 == 0 {
                    // NOTE(review): BE is tried first; BOM-less LE data that
                    // happens to also be valid BE will be mis-decoded.
                    for to_u16 in [u16::from_be_bytes, u16::from_le_bytes] {
                        let units: Vec<u16> = self
                            .chunks_exact(2)
                            .map(|pair| to_u16([pair[0], pair[1]]))
                            .collect();
                        if let Ok(decoded) = String::from_utf16(&units) {
                            return Ok(decoded);
                        }
                    }
                }
                // Last resort: lossy UTF-8 with replacement characters.
                Ok(String::from_utf8_lossy(self).into_owned())
            }
        }
    }
}
/// Collapses every run of whitespace into a single space and trims both ends.
pub trait NormalizeWhitespace {
    /// Returns the normalized copy of the text.
    fn normalize_whitespace(&self) -> String;
}

impl NormalizeWhitespace for &str {
    fn normalize_whitespace(&self) -> String {
        // `split_whitespace` already skips leading/trailing/repeated
        // whitespace, so joining with single spaces replaces the previous
        // manual push loop with identical results.
        self.split_whitespace().collect::<Vec<_>>().join(" ")
    }
}

impl NormalizeWhitespace for String {
    fn normalize_whitespace(&self) -> String {
        self.as_str().normalize_whitespace()
    }
}
/// One node of a parsed XML element tree (see `XmlReader::parse`).
#[derive(Debug)]
pub struct XmlElement {
/// Local (unprefixed) tag name.
pub name: String,
/// Namespace prefix, e.g. `dc` in `<dc:title>`, when present.
pub prefix: Option<String>,
/// Namespace URI resolved from the document's `xmlns` declarations.
pub namespace: Option<String>,
/// Attribute key/value pairs (`xmlns` declarations are excluded).
pub attributes: HashMap<String, String>,
/// Direct text content of this element, if any.
pub text: Option<String>,
/// CDATA content of this element, if any.
pub cdata: Option<String>,
/// Child elements in document order.
pub children: Vec<XmlElement>,
}
impl XmlElement {
    /// Creates an element with the given local name and no other content.
    pub fn new(name: String) -> Self {
        Self {
            name,
            prefix: None,
            namespace: None,
            attributes: HashMap::new(),
            text: None,
            cdata: None,
            children: Vec::new(),
        }
    }

    /// Full tag name: `prefix:name` when a prefix is present, else `name`.
    pub fn tag_name(&self) -> String {
        self.prefix
            .as_ref()
            .map_or_else(|| self.name.clone(), |p| format!("{}:{}", p, self.name))
    }

    /// Concatenated text of this element and all descendants, trimmed.
    pub fn text(&self) -> String {
        let mut combined = self.text.clone().unwrap_or_default();
        for child in &self.children {
            combined.push_str(&child.text());
        }
        combined.trim().to_string()
    }

    /// Looks up an attribute value by key.
    pub fn get_attr(&self, name: &str) -> Option<String> {
        self.attributes.get(name).cloned()
    }

    /// Depth-first iterator over this element and all descendants whose
    /// local name equals `name`.
    pub fn find_elements_by_name(&self, name: &str) -> impl Iterator<Item = &XmlElement> {
        SearchElementsByNameIter::new(self, name)
    }

    /// Direct children whose local name equals `name`.
    pub fn find_children_by_name(&self, name: &str) -> impl Iterator<Item = &XmlElement> {
        self.children.iter().filter(move |c| c.name == name)
    }

    /// Direct children whose local name is any of `names`.
    pub fn find_children_by_names(&self, names: &[&str]) -> impl Iterator<Item = &XmlElement> {
        self.children
            .iter()
            .filter(move |c| names.contains(&c.name.as_str()))
    }

    /// Iterator over direct children in document order.
    pub fn children(&self) -> impl Iterator<Item = &XmlElement> {
        self.children.iter()
    }
}
/// Iterator over an `XmlElement` tree yielding elements whose local name
/// matches `target_name`, in depth-first document order.
struct SearchElementsByNameIter<'a> {
/// All elements of the tree, pre-collected depth-first at construction.
elements: Vec<&'a XmlElement>,
/// Index of the next element in `elements` to examine.
current_index: usize,
/// Local name being searched for.
target_name: String,
}
impl<'a> SearchElementsByNameIter<'a> {
    /// Builds the iterator by eagerly flattening the whole tree rooted at
    /// `root` into depth-first order; filtering happens lazily in `next`.
    fn new(root: &'a XmlElement, name: &str) -> Self {
        let mut flattened = Vec::new();
        Self::collect_elements(root, &mut flattened);
        Self {
            elements: flattened,
            current_index: 0,
            target_name: name.to_owned(),
        }
    }

    /// Appends `node` and, recursively, all its descendants to `out`.
    fn collect_elements(node: &'a XmlElement, out: &mut Vec<&'a XmlElement>) {
        out.push(node);
        for child in &node.children {
            Self::collect_elements(child, out);
        }
    }
}
impl<'a> Iterator for SearchElementsByNameIter<'a> {
    type Item = &'a XmlElement;

    /// Advances through the pre-flattened element list, yielding the next
    /// element whose local name matches `target_name`.
    fn next(&mut self) -> Option<Self::Item> {
        loop {
            let candidate = *self.elements.get(self.current_index)?;
            self.current_index += 1;
            if candidate.name == self.target_name {
                return Some(candidate);
            }
        }
    }
}
/// Stateless XML parsing facade built on `quick_xml`'s `NsReader`.
pub struct XmlReader {}
#[allow(unused)]
impl XmlReader {
pub fn parse(content: &str) -> Result<XmlElement, EpubError> {
if content.is_empty() {
return Err(EpubError::EmptyDataError);
}
let mut reader = NsReader::from_str(content);
reader.config_mut().trim_text(true);
let mut buf = Vec::new();
let mut stack = Vec::<XmlElement>::new();
let mut root = None;
let mut namespace_map = HashMap::new();
loop {
match reader.read_event_into(&mut buf) {
Ok(Event::Eof) => break,
Ok(Event::Start(e)) => {
let name = String::from_utf8_lossy(e.local_name().as_ref()).to_string();
let mut element = XmlElement::new(name);
if let Some(prefix) = e.name().prefix() {
element.prefix = Some(String::from_utf8_lossy(prefix.as_ref()).to_string());
}
for attr in e.attributes().flatten() {
let attr_key = String::from_utf8_lossy(attr.key.as_ref()).to_string();
let attr_value = String::from_utf8_lossy(&attr.value).to_string();
if attr_key.contains("xmlns") {
let attr_keys = attr_key.split(":").collect::<Vec<&str>>();
if attr_keys.len() >= 2 {
namespace_map.insert(attr_keys[1].to_string(), attr_value);
} else {
namespace_map.insert(attr_key, attr_value);
}
continue;
}
element.attributes.insert(attr_key, attr_value);
}
stack.push(element);
}
Ok(Event::End(_)) => {
if let Some(element) = stack.pop() {
if stack.is_empty() {
root = Some(element);
} else if let Some(parent) = stack.last_mut() {
parent.children.push(element);
}
}
}
Ok(Event::Empty(e)) => {
let name = String::from_utf8_lossy(e.local_name().as_ref()).to_string();
let mut element = XmlElement::new(name);
if let Some(prefix) = e.name().prefix() {
element.prefix = Some(String::from_utf8_lossy(prefix.as_ref()).to_string());
}
for attr in e.attributes().flatten() {
let attr_key = String::from_utf8_lossy(attr.key.as_ref()).to_string();
let attr_value = String::from_utf8_lossy(&attr.value).to_string();
if attr_key.contains("xmlns") {
let attr_keys = attr_key.split(":").collect::<Vec<&str>>();
if attr_keys.len() >= 2 {
namespace_map.insert(attr_keys[1].to_string(), attr_value);
} else {
namespace_map.insert(attr_key, attr_value);
}
continue;
}
element.attributes.insert(attr_key, attr_value);
}
if let Some(parent) = stack.last_mut() {
parent.children.push(element);
}
}
Ok(Event::Text(e)) => {
if let Some(element) = stack.last_mut() {
let text = String::from_utf8_lossy(e.as_ref()).to_string();
if !text.trim().is_empty() {
element.text = Some(text);
}
}
}
Ok(Event::CData(e)) => {
if let Some(element) = stack.last_mut() {
element.cdata = Some(String::from_utf8_lossy(e.as_ref()).to_string());
}
}
Err(err) => return Err(err.into()),
_ => continue,
}
}
if let Some(element) = root.as_mut() {
Self::assign_namespace(element, &namespace_map);
}
root.ok_or(EpubError::EmptyDataError)
}
pub fn parse_bytes(bytes: Vec<u8>) -> Result<XmlElement, EpubError> {
let content = bytes.decode()?;
Self::parse(&content)
}
fn assign_namespace(element: &mut XmlElement, namespace_map: &HashMap<String, String>) {
if let Some(prefix) = &element.prefix {
if let Some(namespace) = namespace_map.get(prefix) {
element.namespace = Some(namespace.clone());
}
} else if let Some(namespace) = namespace_map.get("xmlns") {
element.namespace = Some(namespace.clone());
}
for chiled in element.children.iter_mut() {
Self::assign_namespace(chiled, namespace_map);
}
}
}
#[cfg(test)]
mod tests {
    use crate::{
        error::EpubError,
        utils::{DecodeBytes, NormalizeWhitespace},
    };

    #[test]
    fn test_decode_empty_data() {
        let bytes: Vec<u8> = Vec::new();
        let decoded = bytes.decode();
        assert!(decoded.is_err());
        assert_eq!(decoded.unwrap_err(), EpubError::EmptyDataError);
    }

    #[test]
    fn test_decode_short_data() {
        // Fewer than 4 bytes is rejected even when it looks like a BOM start.
        let bytes = vec![0xEF, 0xBB];
        let decoded = bytes.decode();
        assert!(decoded.is_err());
        assert_eq!(decoded.unwrap_err(), EpubError::EmptyDataError);
    }

    #[test]
    fn test_decode_utf8_with_bom() {
        let mut bytes = vec![0xEF, 0xBB, 0xBF];
        bytes.extend_from_slice(b"Hello");
        assert_eq!(bytes.decode().unwrap(), "Hello");
    }

    #[test]
    fn test_decode_utf16_be_with_bom() {
        let mut bytes = vec![0xFE, 0xFF];
        for ch in "Hello".chars() {
            bytes.extend_from_slice(&(ch as u16).to_be_bytes());
        }
        assert_eq!(bytes.decode().unwrap(), "Hello");
    }

    #[test]
    fn test_decode_utf16_le_with_bom() {
        let mut bytes = vec![0xFF, 0xFE];
        for ch in "Hello".chars() {
            bytes.extend_from_slice(&(ch as u16).to_le_bytes());
        }
        assert_eq!(bytes.decode().unwrap(), "Hello");
    }

    #[test]
    fn test_decode_plain_utf8() {
        let bytes = b"Hello, World!".to_vec();
        assert_eq!(bytes.decode().unwrap(), "Hello, World!");
    }

    #[test]
    fn test_normalize_whitespace_trait() {
        // Exercises both the `&str` and the `String` implementations.
        let raw = " Hello,\tWorld!\n\nRust ";
        assert_eq!(raw.normalize_whitespace(), "Hello, World! Rust");
        assert_eq!(raw.to_string().normalize_whitespace(), "Hello, World! Rust");
    }
}
}