use bstr::ByteSlice;
use memchr::memmem;
use std::collections::HashMap;
/// Writes the four least-significant base-32 digits of `num` into `buf`.
///
/// Digits are the ASCII characters `0-9A-V`, most significant digit first and
/// zero-padded on the left. Anything above `32^4 - 1` is silently truncated to
/// its low four digits.
#[inline]
pub fn write_base32_4(num: usize, buf: &mut [u8; 4]) {
    const ALPHABET: &[u8; 32] = b"0123456789ABCDEFGHIJKLMNOPQRSTUV";
    let mut rest = num;
    // Fill from the last slot backwards, peeling off five bits per digit.
    for slot in buf.iter_mut().rev() {
        *slot = ALPHABET[rest & 0x1F];
        rest >>= 5;
    }
}
/// Writes `num` as ten base-32 digits (`0-9A-V`) into `buf`.
///
/// The most significant digit comes first and unused leading positions are
/// filled with `0`.
#[inline]
pub fn write_base32_10(num: usize, buf: &mut [u8; 10]) {
    const ALPHABET: &[u8; 32] = b"0123456789ABCDEFGHIJKLMNOPQRSTUV";
    // Walk the slots from least to most significant, carrying the quotient along.
    let _ = buf.iter_mut().rev().fold(num, |rest, slot| {
        *slot = ALPHABET[rest % 32];
        rest / 32
    });
}
/// An internal hyperlink recorded while rewriting an `<a>` tag.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CollectedLink {
    /// Href of the file the link points to: the current document for
    /// fragment-only links, otherwise the link's file part resolved against
    /// the current document's directory.
    pub target_file: String,
    /// Text after the first `#` of the original href; empty when the href has
    /// no fragment.
    pub fragment: String,
}
/// Output of [`rewrite_html_references_fast`].
pub struct RewriteResult {
    /// The document bytes after tag rewriting.
    pub html: Vec<u8>,
    /// One entry per rewritten `<a>` tag, in the order the tags were rewritten.
    pub links: Vec<CollectedLink>,
}
/// Rewrites `<link>`, `<img>` and `<a>` tags of one HTML document to
/// Kindle-style references.
///
/// The input is scanned for the literal byte sequences `<link `, `<img ` and
/// `<a `. Each matching tag (up to and including the next `>`) is handed to the
/// corresponding `process_*_tag` helper; all other bytes are copied through
/// unchanged.
///
/// * `html` - raw bytes of the document.
/// * `html_href` - href of the document itself. Its parent directory is the
///   base for resolving relative references, and it is the target of
///   fragment-only links.
/// * `css_flow_map` - resolved stylesheet href to flow index (`<link>` tags).
/// * `resource_map` - resolved resource href to resource index (`<img>` tags).
/// * `spine_hrefs` - hrefs whose `<a>` links are rewritten and collected.
/// * `book_resources` - consulted for the media type of rewritten images.
/// * `link_counter_start` - initial value of the counter numbering rewritten
///   anchors.
///
/// Returns the rewritten bytes plus the links collected from rewritten anchors.
pub fn rewrite_html_references_fast(
    html: &[u8],
    html_href: &str,
    css_flow_map: &HashMap<String, usize>,
    resource_map: &HashMap<String, usize>,
    spine_hrefs: &std::collections::HashSet<&str>,
    book_resources: &HashMap<String, crate::book::Resource>,
    link_counter_start: usize,
) -> RewriteResult {
    let mut output = Vec::with_capacity(html.len() + html.len() / 10);
    let mut links = Vec::new();
    let mut link_counter = link_counter_start;
    let mut pos = 0;

    let base_dir = std::path::Path::new(html_href)
        .parent()
        .map(|p| p.to_string_lossy().to_string())
        .unwrap_or_default();

    let link_finder = memmem::Finder::new(b"<link ");
    let img_finder = memmem::Finder::new(b"<img ");
    let a_finder = memmem::Finder::new(b"<a ");

    // Cache the next hit of every pattern. A cached hit at or after `pos` is
    // still the first hit from `pos`, so a pattern is only searched again once
    // the cursor has moved past its hit. Re-running all three searches on every
    // iteration would rescan the rest of the document per tag (quadratic when
    // one pattern is rare or absent).
    let mut next_link = link_finder.find(html);
    let mut next_img = img_finder.find(html);
    let mut next_a = a_finder.find(html);

    while pos < html.len() {
        if matches!(next_link, Some(hit) if hit < pos) {
            next_link = link_finder.find(&html[pos..]).map(|rel| rel + pos);
        }
        if matches!(next_img, Some(hit) if hit < pos) {
            next_img = img_finder.find(&html[pos..]).map(|rel| rel + pos);
        }
        if matches!(next_a, Some(hit) if hit < pos) {
            next_a = a_finder.find(&html[pos..]).map(|rel| rel + pos);
        }

        let tag_start = match [next_link, next_img, next_a].into_iter().flatten().min() {
            Some(start) => start,
            None => {
                // No further tags of interest: copy the tail verbatim.
                output.extend_from_slice(&html[pos..]);
                break;
            }
        };
        output.extend_from_slice(&html[pos..tag_start]);

        let tag_end = match memchr::memchr(b'>', &html[tag_start..]) {
            Some(rel) => tag_start + rel + 1,
            None => {
                // Unterminated tag. Everything before `tag_start` has already
                // been emitted above, so only the remainder from `tag_start`
                // may be appended (copying from `pos` would duplicate text).
                output.extend_from_slice(&html[tag_start..]);
                break;
            }
        };
        let tag = &html[tag_start..tag_end];

        if tag.starts_with(b"<link ") {
            process_link_tag(tag, &base_dir, css_flow_map, &mut output);
        } else if tag.starts_with(b"<img ") {
            process_img_tag(tag, &base_dir, resource_map, book_resources, &mut output);
        } else if tag.starts_with(b"<a ") {
            process_anchor_tag(
                tag,
                html_href,
                &base_dir,
                spine_hrefs,
                &mut output,
                &mut links,
                &mut link_counter,
            );
        } else {
            output.extend_from_slice(tag);
        }
        pos = tag_end;
    }

    RewriteResult {
        html: output,
        links,
    }
}
/// Rewrites the `href` of a `<link ...>` tag to a `kindle:flow:` reference.
///
/// `tag` is the complete tag, from `<link ` through the closing `>`. When the
/// tag has an `href` that, resolved against `base_dir`, is a key of
/// `css_flow_map`, the tag is re-serialised into `output` with the `href`
/// replaced by `kindle:flow:XXXX?mime=text/css` (`XXXX` being the flow index in
/// base 32) and every other attribute copied. In all other cases the tag is
/// appended to `output` unchanged.
fn process_link_tag(
    tag: &[u8],
    base_dir: &str,
    css_flow_map: &HashMap<String, usize>,
    output: &mut Vec<u8>,
) {
    /// Appends `name="value" ` to `out`. A value that contains `"` is wrapped
    /// in single quotes instead, so that it does not terminate itself early.
    fn push_attr(out: &mut Vec<u8>, name: &[u8], value: &[u8]) {
        let quote = if value.contains(&b'"') { b'\'' } else { b'"' };
        out.extend_from_slice(name);
        out.push(b'=');
        out.push(quote);
        out.extend_from_slice(value);
        out.push(quote);
        out.push(b' ');
    }

    let flow_idx = extract_attribute_value(tag, b"href").and_then(|raw| {
        let resolved = resolve_href(base_dir, &String::from_utf8_lossy(raw));
        css_flow_map.get(&resolved).copied()
    });
    let flow_idx = match flow_idx {
        Some(idx) => idx,
        None => {
            // No href, or not a known stylesheet: leave the tag untouched.
            output.extend_from_slice(tag);
            return;
        }
    };

    let mut base32_buf = [0u8; 4];
    write_base32_4(flow_idx, &mut base32_buf);

    output.extend_from_slice(b"<link ");
    for (name, value) in AttributeIter::new(tag) {
        // Compare case-insensitively, exactly like the lookup above. A
        // case-sensitive comparison here would copy `HREF="..."` verbatim, and
        // the rewritten reference would end up as a second, duplicate href.
        // Because the lookup succeeded, this branch is always reached, so no
        // "append href afterwards" fallback is needed.
        if name.eq_ignore_ascii_case(b"href") {
            output.extend_from_slice(b"href=\"kindle:flow:");
            output.extend_from_slice(&base32_buf);
            output.extend_from_slice(b"?mime=text/css\" ");
        } else {
            push_attr(output, name, value);
        }
    }
    if tag.ends_with(b"/>") {
        output.extend_from_slice(b"/>");
    } else {
        output.push(b'>');
    }
}
/// Rewrites the `src` of an `<img ...>` tag to a `kindle:embed:` reference.
///
/// `tag` is the complete tag, from `<img ` through the closing `>`. When the
/// tag has a `src` that, resolved against `base_dir`, is a key of
/// `resource_map`, the tag is re-serialised into `output` with the `src`
/// replaced by `kindle:embed:XXXX?mime=<type>` and every other attribute
/// copied. `XXXX` is the resource index in base 32; `<type>` is the resource's
/// `media_type` from `book_resources`, or `image/jpeg` when the resource is not
/// listed there. In all other cases the tag is appended to `output` unchanged.
fn process_img_tag(
    tag: &[u8],
    base_dir: &str,
    resource_map: &HashMap<String, usize>,
    book_resources: &HashMap<String, crate::book::Resource>,
    output: &mut Vec<u8>,
) {
    /// Appends `name="value" ` to `out`. A value that contains `"` is wrapped
    /// in single quotes instead, so that it does not terminate itself early.
    fn push_attr(out: &mut Vec<u8>, name: &[u8], value: &[u8]) {
        let quote = if value.contains(&b'"') { b'\'' } else { b'"' };
        out.extend_from_slice(name);
        out.push(b'=');
        out.push(quote);
        out.extend_from_slice(value);
        out.push(quote);
        out.push(b' ');
    }

    let target = extract_attribute_value(tag, b"src").and_then(|raw| {
        let resolved = resolve_href(base_dir, &String::from_utf8_lossy(raw));
        resource_map.get(&resolved).map(|&idx| (resolved, idx))
    });
    let (resolved, res_idx) = match target {
        Some(found) => found,
        None => {
            // No src, or not a known resource: leave the tag untouched.
            output.extend_from_slice(tag);
            return;
        }
    };

    let mime = book_resources
        .get(&resolved)
        .map(|r| r.media_type.as_str())
        .unwrap_or("image/jpeg");
    let mut base32_buf = [0u8; 4];
    write_base32_4(res_idx, &mut base32_buf);

    output.extend_from_slice(b"<img ");
    for (name, value) in AttributeIter::new(tag) {
        // Compare case-insensitively, exactly like the lookup above; otherwise
        // a tag written as `SRC="..."` is found but never rewritten.
        if name.eq_ignore_ascii_case(b"src") {
            output.extend_from_slice(b"src=\"kindle:embed:");
            output.extend_from_slice(&base32_buf);
            output.extend_from_slice(b"?mime=");
            output.extend_from_slice(mime.as_bytes());
            output.extend_from_slice(b"\" ");
        } else {
            push_attr(output, name, value);
        }
    }
    if tag.ends_with(b"/>") {
        output.extend_from_slice(b"/>");
    } else {
        output.push(b'>');
    }
}
/// Rewrites an internal `<a ...>` link to a `kindle:pos:fid` placeholder and
/// records its real target.
///
/// `tag` is the complete tag, from `<a ` through the closing `>`. Tags without
/// an `href`, links to `http://`, `https://`, `mailto:` or `kindle:` targets,
/// and links whose target file is not in `spine_hrefs` are appended to `output`
/// unchanged.
///
/// For every other link `*link_counter` is incremented first, and the `href` is
/// replaced by `kindle:pos:fid:0000:off:` followed by the new counter value as
/// ten base-32 digits. The real target is pushed onto `links`: the resolved
/// target file plus the fragment (text after the first `#`, possibly empty). A
/// fragment-only href (`#foo`) targets `html_href` itself.
fn process_anchor_tag(
    tag: &[u8],
    html_href: &str,
    base_dir: &str,
    spine_hrefs: &std::collections::HashSet<&str>,
    output: &mut Vec<u8>,
    links: &mut Vec<CollectedLink>,
    link_counter: &mut usize,
) {
    /// Appends `name="value" ` to `out`. A value that contains `"` is wrapped
    /// in single quotes instead, so that it does not terminate itself early.
    fn push_attr(out: &mut Vec<u8>, name: &[u8], value: &[u8]) {
        let quote = if value.contains(&b'"') { b'\'' } else { b'"' };
        out.extend_from_slice(name);
        out.push(b'=');
        out.push(quote);
        out.extend_from_slice(value);
        out.push(quote);
        out.push(b' ');
    }

    let href_value = match extract_attribute_value(tag, b"href") {
        Some(value) => value,
        None => {
            output.extend_from_slice(tag);
            return;
        }
    };
    let href_str = String::from_utf8_lossy(href_value);

    // Require the full scheme prefix: a bare `starts_with("http")` would also
    // classify local files such as `http-notes.xhtml` as external.
    let is_external = href_str.starts_with("http://")
        || href_str.starts_with("https://")
        || href_str.starts_with("mailto:")
        || href_str.starts_with("kindle:");
    if is_external {
        output.extend_from_slice(tag);
        return;
    }

    let (target_file, fragment) = match href_str.split_once('#') {
        // `#frag` points into the current document.
        Some(("", frag)) => (html_href.to_string(), frag.to_string()),
        Some((file, frag)) => (resolve_href(base_dir, file), frag.to_string()),
        None => (resolve_href(base_dir, &href_str), String::new()),
    };

    if !spine_hrefs.contains(target_file.as_str()) {
        output.extend_from_slice(tag);
        return;
    }

    *link_counter += 1;
    let mut base32_buf = [0u8; 10];
    write_base32_10(*link_counter, &mut base32_buf);

    output.extend_from_slice(b"<a ");
    for (name, value) in AttributeIter::new(tag) {
        // Compare case-insensitively, exactly like the lookup above; otherwise
        // `HREF="..."` is counted and collected but never rewritten.
        if name.eq_ignore_ascii_case(b"href") {
            output.extend_from_slice(b"href=\"kindle:pos:fid:0000:off:");
            output.extend_from_slice(&base32_buf);
            output.extend_from_slice(b"\" ");
        } else {
            push_attr(output, name, value);
        }
    }
    // Keep a self-closing anchor self-closing, as the link/img helpers do;
    // dropping the `/` would leave an unclosed `<a>` element behind.
    if tag.ends_with(b"/>") {
        output.extend_from_slice(b"/>");
    } else {
        output.push(b'>');
    }

    links.push(CollectedLink {
        target_file,
        fragment,
    });
}
/// Returns the value of the first attribute of `tag` whose name equals
/// `attr_name`, compared ASCII case-insensitively.
///
/// The returned slice borrows from `tag`. `None` means no attribute of that
/// name was found.
fn extract_attribute_value<'a>(tag: &'a [u8], attr_name: &[u8]) -> Option<&'a [u8]> {
    AttributeIter::new(tag)
        .find(|(candidate, _)| candidate.eq_ignore_ascii_case(attr_name))
        .map(|(_, value)| value)
}
/// Iterator over the attributes of a single tag, yielding `(name, value)`
/// pairs as byte slices borrowed from the tag.
///
/// Nothing is copied, unescaped or case-folded. An attribute without a value
/// (`<a download>`) yields an empty value slice. Iteration stops at the first
/// `>` or `/` found outside an attribute value.
struct AttributeIter<'a> {
    /// The complete tag, including the leading `<name`.
    data: &'a [u8],
    /// Offset of the next unread byte in `data`.
    pos: usize,
}

impl<'a> AttributeIter<'a> {
    /// Creates an iterator positioned just after the tag name of `tag`.
    ///
    /// The tag name is taken to end at the first ASCII whitespace byte, `>` or
    /// `/`, so tags whose name is followed by a tab or newline parse the same
    /// way as tags using a plain space.
    fn new(tag: &'a [u8]) -> Self {
        let pos = tag
            .iter()
            .position(|&b| b.is_ascii_whitespace() || b == b'>' || b == b'/')
            .unwrap_or(tag.len());
        Self { data: tag, pos }
    }
}

impl<'a> Iterator for AttributeIter<'a> {
    type Item = (&'a [u8], &'a [u8]);

    fn next(&mut self) -> Option<Self::Item> {
        const EMPTY: &[u8] = &[];
        let data = self.data;

        // Attributes may be separated by any ASCII whitespace, not only by
        // spaces (multi-line tags are common in hand-written markup).
        while data.get(self.pos).map_or(false, |b| b.is_ascii_whitespace()) {
            self.pos += 1;
        }
        match data.get(self.pos) {
            None | Some(&b'>') | Some(&b'/') => return None,
            Some(_) => {}
        }

        let name_start = self.pos;
        while let Some(&b) = data.get(self.pos) {
            if b == b'=' || b == b'>' || b == b'/' || b.is_ascii_whitespace() {
                break;
            }
            self.pos += 1;
        }
        let name = &data[name_start..self.pos];

        if data.get(self.pos) != Some(&b'=') {
            // Attribute without a value.
            return Some((name, EMPTY));
        }
        self.pos += 1; // consume '='

        match data.get(self.pos) {
            Some(&quote) if quote == b'"' || quote == b'\'' => {
                let value_start = self.pos + 1;
                let value_len = data[value_start..]
                    .iter()
                    .position(|&b| b == quote)
                    .unwrap_or(data.len() - value_start);
                let value_end = value_start + value_len;
                // Step over the closing quote when there is one.
                self.pos = (value_end + 1).min(data.len());
                Some((name, &data[value_start..value_end]))
            }
            _ => {
                // Unquoted value: runs up to the next whitespace or `>`.
                // Without this branch the value would be dropped and then
                // re-read as a bogus attribute name on the next call.
                let value_start = self.pos;
                while let Some(&b) = data.get(self.pos) {
                    if b == b'>' || b.is_ascii_whitespace() {
                        break;
                    }
                    self.pos += 1;
                }
                let mut value_end = self.pos;
                // A `/` directly in front of the closing `>` is treated as the
                // self-closing marker rather than as part of the value. This
                // matches the callers, which test the tag with
                // `ends_with(b"/>")`.
                if value_end > value_start
                    && data[value_end - 1] == b'/'
                    && data.get(value_end) == Some(&b'>')
                {
                    value_end -= 1;
                }
                Some((name, &data[value_start..value_end]))
            }
        }
    }
}
/// Resolves `href` against `base_dir` using `/`-separated path segments.
///
/// * An `href` starting with `/` is returned with all leading slashes removed
///   and no further normalisation.
/// * Otherwise the segments of `href` are applied on top of the segments of
///   `base_dir`: `..` drops the last segment (if any), `.` and empty segments
///   are ignored, and anything else is appended.
fn resolve_href(base_dir: &str, href: &str) -> String {
    if let Some(rooted) = href.strip_prefix('/') {
        return rooted.trim_start_matches('/').to_string();
    }

    let mut stack: Vec<&str> = Vec::new();
    if !base_dir.is_empty() {
        stack.extend(base_dir.split('/'));
    }

    for piece in href.split('/') {
        if piece == ".." {
            stack.pop();
        } else if !(piece.is_empty() || piece == ".") {
            stack.push(piece);
        }
    }

    stack.join("/")
}
pub fn rewrite_css_references_fast(css: &[u8], resource_map: &HashMap<String, usize>) -> Vec<u8> {
let mut output = Vec::with_capacity(css.len());
let mut pos = 0;
let url_finder = memmem::Finder::new(b"url(");
while let Some(url_start) = url_finder.find(&css[pos..]) {
let abs_start = pos + url_start;
output.extend_from_slice(&css[pos..abs_start]);
let content_start = abs_start + 4;
if let Some(paren_end) = css[content_start..].find_byte(b')') {
let url_content = &css[content_start..content_start + paren_end];
let url = url_content.trim_with(|c| c == '"' || c == '\'' || c == ' ');
if url.starts_with(b"data:") || url.starts_with(b"http") {
output.extend_from_slice(&css[abs_start..content_start + paren_end + 1]);
} else {
let url_str = String::from_utf8_lossy(url);
let normalized = url_str.trim_start_matches("../").trim_start_matches("./");
let mut found = false;
for (href, &res_idx) in resource_map {
if href.ends_with(normalized) || href == normalized {
let mut base32_buf = [0u8; 4];
write_base32_4(res_idx, &mut base32_buf);
output.extend_from_slice(b"url(kindle:embed:");
output.extend_from_slice(&base32_buf);
output.push(b')');
found = true;
break;
}
}
if !found {
output.extend_from_slice(&css[abs_start..content_start + paren_end + 1]);
}
}
pos = content_start + paren_end + 1;
} else {
output.extend_from_slice(&css[pos..]);
break;
}
}
output.extend_from_slice(&css[pos..]);
output
}
/// Output of [`add_aid_attributes_fast`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct AidInsertResult {
    /// The document bytes with `aid` attributes inserted.
    pub html: Vec<u8>,
    /// One `(offset, aid)` entry per tag that received an `aid`. `offset` is
    /// the byte offset of the tag's `<` in the *input* document.
    pub position_map: Vec<(usize, String)>,
}
pub fn add_aid_attributes_fast(
html: &[u8],
file_href: &str,
aid_counter: &mut u32,
id_map: &mut HashMap<(String, String), String>,
) -> AidInsertResult {
use super::skeleton::AID_ABLE_TAGS;
let mut output = Vec::with_capacity(html.len() + html.len() / 5); let mut position_map = Vec::new();
let mut pos = 0;
while pos < html.len() {
if let Some(tag_start_rel) = memchr::memchr(b'<', &html[pos..]) {
let tag_start = pos + tag_start_rel;
output.extend_from_slice(&html[pos..tag_start]);
if html.get(tag_start + 1) == Some(&b'/')
|| html.get(tag_start + 1) == Some(&b'!')
|| html.get(tag_start + 1) == Some(&b'?')
{
if let Some(tag_end_rel) = memchr::memchr(b'>', &html[tag_start..]) {
let tag_end = tag_start + tag_end_rel + 1;
output.extend_from_slice(&html[tag_start..tag_end]);
pos = tag_end;
} else {
output.extend_from_slice(&html[tag_start..]);
break;
}
continue;
}
let mut name_end = tag_start + 1;
while name_end < html.len()
&& html[name_end] != b' '
&& html[name_end] != b'\t'
&& html[name_end] != b'\n'
&& html[name_end] != b'\r'
&& html[name_end] != b'\x0c'
&& html[name_end] != b'>'
&& html[name_end] != b'/'
{
name_end += 1;
}
let tag_name = &html[tag_start + 1..name_end];
let tag_name_lower = tag_name.to_ascii_lowercase();
let is_aidable = AID_ABLE_TAGS
.iter()
.any(|&t| t.as_bytes() == tag_name_lower.as_slice());
if let Some(tag_end_rel) = memchr::memchr(b'>', &html[tag_start..]) {
let tag_end = tag_start + tag_end_rel + 1;
let tag = &html[tag_start..tag_end];
if is_aidable && tag.find(b"aid=").is_none() {
let mut aid_buf = [0u8; 4];
write_base32_4(*aid_counter as usize, &mut aid_buf);
let aid_str = std::str::from_utf8(&aid_buf).unwrap().to_string();
*aid_counter += 1;
position_map.push((tag_start, aid_str.clone()));
if let Some(id_value) = extract_attribute_value(tag, b"id") {
let id_str = String::from_utf8_lossy(id_value).to_string();
id_map.insert((file_href.to_string(), id_str), aid_str.clone());
}
if tag_name_lower == b"body" {
id_map.insert((file_href.to_string(), String::new()), aid_str.clone());
}
output.push(b'<');
output.extend_from_slice(tag_name);
let is_self_closing = tag.ends_with(b"/>");
let attr_end = if is_self_closing {
tag_end - 2 } else {
tag_end - 1 };
if name_end < attr_end {
output.extend_from_slice(&html[name_end..attr_end]);
output.extend_from_slice(b" aid=\"");
} else {
output.extend_from_slice(b" aid=\"");
}
output.extend_from_slice(&aid_buf);
output.push(b'"');
if is_self_closing {
output.extend_from_slice(b"/>");
} else {
output.push(b'>');
}
} else {
output.extend_from_slice(tag);
}
pos = tag_end;
} else {
output.extend_from_slice(&html[tag_start..]);
break;
}
} else {
output.extend_from_slice(&html[pos..]);
break;
}
}
AidInsertResult {
html: output,
position_map,
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Covers zero padding, the digit alphabet's upper end and carries.
    #[test]
    fn test_write_base32_4() {
        let mut buf = [0u8; 4];
        write_base32_4(0, &mut buf);
        assert_eq!(&buf, b"0000");
        write_base32_4(1, &mut buf);
        assert_eq!(&buf, b"0001");
        write_base32_4(31, &mut buf);
        assert_eq!(&buf, b"000V");
        write_base32_4(32, &mut buf);
        assert_eq!(&buf, b"0010");
        write_base32_4(32767, &mut buf);
        assert_eq!(&buf, b"0VVV");
        write_base32_4(1_048_575, &mut buf);
        assert_eq!(&buf, b"VVVV");
    }

    /// Both attributes must come back, and the trailing `/>` must not be
    /// reported as an attribute.
    #[test]
    fn test_attribute_iter() {
        let tag = b"<img src=\"test.jpg\" alt=\"hello\" />";
        let attrs: Vec<_> = AttributeIter::new(tag).collect();
        assert_eq!(attrs.len(), 2);
        assert_eq!(attrs[0].0, b"src");
        assert_eq!(attrs[0].1, b"test.jpg");
        assert_eq!(attrs[1].0, b"alt");
        assert_eq!(attrs[1].1, b"hello");
    }

    #[test]
    fn test_resolve_href() {
        assert_eq!(
            resolve_href("chapter", "../images/test.jpg"),
            "images/test.jpg"
        );
        assert_eq!(resolve_href("", "test.html"), "test.html");
        // `.` and empty segments are dropped.
        assert_eq!(resolve_href("a/b", "./c//d.html"), "a/b/c/d.html");
        // Rooted hrefs ignore the base directory.
        assert_eq!(resolve_href("a", "/x/y"), "x/y");
        // `..` beyond the root is ignored rather than failing.
        assert_eq!(resolve_href("a", "../../x"), "x");
    }

    /// The aid goes after the existing attributes and in front of `/>`, and
    /// the bookkeeping (counter, id map, position map) is updated.
    #[test]
    fn test_add_aid_self_closing() {
        use bstr::ByteSlice;
        let html = b"<a id=\"tp\"/>";
        let mut aid_counter = 0u32;
        let mut id_map = HashMap::new();
        let result = add_aid_attributes_fast(html, "test.html", &mut aid_counter, &mut id_map);
        let result_str = String::from_utf8_lossy(&result.html);
        assert!(
            result_str.contains("id=\"tp\" aid="),
            "aid should come after id, not after /: {result_str}"
        );
        assert!(
            !result_str.contains("\"/ aid"),
            "should not have stray / before aid: {result_str}"
        );
        assert!(
            result.html.ends_with_str(b"/>"),
            "should end with />: {result_str}"
        );
        assert_eq!(aid_counter, 1, "exactly one aid should have been handed out");
        assert_eq!(
            id_map.get(&("test.html".to_string(), "tp".to_string())),
            Some(&"0000".to_string())
        );
        assert_eq!(result.position_map, vec![(0, "0000".to_string())]);
    }
}