use bstr::ByteSlice;
use memchr::memmem;
use super::parser::DivElement;
/// Decodes a big-endian base-32 number written with the digits `0-9` followed
/// by `A-V` (either case).
///
/// Every input byte occupies one digit position: a byte outside the alphabet
/// still shifts the accumulator by one digit but contributes a value of zero.
/// Arithmetic wraps on overflow instead of panicking, so arbitrarily long
/// inputs are accepted.
#[inline]
pub fn parse_base32(s: &[u8]) -> usize {
    s.iter().fold(0usize, |acc, &byte| {
        let shifted = acc.wrapping_mul(32);
        let digit = match byte {
            b'0'..=b'9' => usize::from(byte - b'0'),
            b'A'..=b'V' => usize::from(byte - b'A') + 10,
            b'a'..=b'v' => usize::from(byte - b'a') + 10,
            // Not a base-32 digit: keep the shift, add nothing.
            _ => return shifted,
        };
        shifted.wrapping_add(digit)
    })
}
/// A `kindle:` reference located in the HTML, as produced by `parse_kindle_ref`.
struct KindleRef {
/// Length in bytes of the reference, counted from the `k` of `kindle:` up to
/// (not including) the terminating `"`, `'` or `)`.
end: usize,
/// What the reference points at.
kind: RefKind,
}
/// The different targets a `kindle:` reference can name.
enum RefKind {
/// `kindle:flow:<id>[?...]`; `flow_num` is the base-32 decoded `<id>`.
/// Rewritten to a `styles/styleNNNN.css` path using `flow_num - 1`.
Flow { flow_num: usize },
/// `kindle:pos:fid:<fid>:off:<off>`; `elem_idx` is the decoded `<fid>` (used
/// as an index into the div element table) and `offset` the decoded `<off>`.
PosFid { elem_idx: usize, offset: usize },
/// `kindle:pos:fid:<fid>` without an `:off:` part.
PosFidOld { elem_idx: usize },
/// `kindle:embed:<id>[?...]`; `img_idx` is the decoded `<id>` minus one
/// (saturating) and `ext` is `"png"`, `"gif"` or `"jpg"` depending on whether
/// the reference text contains `image/png` / `image/gif`.
Embed { img_idx: usize, ext: &'static str },
/// Any other `kindle:` reference; it is replaced by an empty string.
Malformed,
}
/// Rewrites every `kindle:` reference found in `html` and returns the new
/// document.
///
/// Each occurrence of the literal `kindle:` is handed to `parse_kindle_ref`.
/// When it parses, the whole reference (up to its terminating delimiter) is
/// replaced by the output of `generate_replacement`; `elems`, `raw_text` and
/// `file_starts` are forwarded to it unchanged. When it does not parse, the
/// literal `kindle:` is copied through and scanning resumes right after it.
/// All other bytes are copied verbatim.
pub fn transform_kindle_refs(
    html: &[u8],
    elems: &[DivElement],
    raw_text: &[u8],
    file_starts: &[(u32, u32)],
) -> Vec<u8> {
    const SCHEME: &[u8] = b"kindle:";

    let scheme_finder = memmem::Finder::new(SCHEME);
    let mut rewritten = Vec::with_capacity(html.len());
    let mut cursor = 0;

    while let Some(hit) = scheme_finder
        .find(&html[cursor..])
        .map(|rel| cursor + rel)
    {
        // Everything between the previous reference and this one is untouched.
        rewritten.extend_from_slice(&html[cursor..hit]);

        cursor = match parse_kindle_ref(&html[hit..]) {
            Some(kindle_ref) => {
                let replacement =
                    generate_replacement(&kindle_ref, elems, raw_text, file_starts);
                rewritten.extend_from_slice(&replacement);
                hit + kindle_ref.end
            }
            None => {
                rewritten.extend_from_slice(SCHEME);
                hit + SCHEME.len()
            }
        };
    }

    rewritten.extend_from_slice(&html[cursor..]);
    rewritten
}
fn parse_kindle_ref(data: &[u8]) -> Option<KindleRef> {
if !data.starts_with(b"kindle:") {
return None;
}
let end_pos = data[7..]
.iter()
.position(|&b| b == b'"' || b == b'\'' || b == b')')?;
let end = 7 + end_pos;
let content = &data[7..end];
let kind = if content.starts_with(b"flow:") {
let id_end = content[5..].find_byte(b'?').unwrap_or(content.len() - 5);
let flow_num = parse_base32(&content[5..5 + id_end]);
RefKind::Flow { flow_num }
} else if content.starts_with(b"pos:fid:") {
parse_pos_fid(content)
} else if content.starts_with(b"embed:") {
let id_end = content[6..].find_byte(b'?').unwrap_or(content.len() - 6);
let img_num = parse_base32(&content[6..6 + id_end]);
let img_idx = img_num.saturating_sub(1);
let ext = if content.find(b"image/png").is_some() {
"png"
} else if content.find(b"image/gif").is_some() {
"gif"
} else {
"jpg"
};
RefKind::Embed { img_idx, ext }
} else {
RefKind::Malformed
};
Some(KindleRef { end, kind })
}
/// Classifies a `pos:fid:<fid>[:off:<off>]` reference.
///
/// `content` must begin with the 8-byte `pos:fid:` prefix (the caller checks
/// this). `<fid>` runs up to the next `:`; when that colon starts `:off:`,
/// everything after it is decoded as the offset and `RefKind::PosFid` is
/// returned, otherwise `RefKind::PosFidOld`.
fn parse_pos_fid(content: &[u8]) -> RefKind {
    let rest = &content[8..];
    let (fid, tail) = match rest.find_byte(b':') {
        Some(colon) => rest.split_at(colon),
        None => (rest, &rest[rest.len()..]),
    };
    let elem_idx = parse_base32(fid);

    match tail.strip_prefix(b":off:") {
        Some(off) => RefKind::PosFid {
            elem_idx,
            offset: parse_base32(off),
        },
        None => RefKind::PosFidOld { elem_idx },
    }
}
/// Produces the bytes that replace a parsed `kindle:` reference.
///
/// * `Flow`      -> `styles/styleNNNN.css`, numbered `flow_num - 1` (saturating).
/// * `PosFid`    -> `partNNNN.html#<anchor>`; the file number comes from
///   `elems[elem_idx]`, and the anchor is looked up with `find_nearest_id_fast`
///   at `insert_pos + offset`. The fragment is omitted when no anchor is found.
/// * `PosFidOld` -> `partNNNN.html` for the file of `elems[elem_idx]`.
/// * `Embed`     -> `images/image_NNNN.<ext>`.
/// * `Malformed` -> empty.
///
/// An `elem_idx` outside `elems` falls back to file `0` (and position `0`).
fn generate_replacement(
    kindle_ref: &KindleRef,
    elems: &[DivElement],
    raw_text: &[u8],
    file_starts: &[(u32, u32)],
) -> Vec<u8> {
    match &kindle_ref.kind {
        RefKind::Flow { flow_num } => {
            let css_idx = flow_num.saturating_sub(1);
            format!("styles/style{:04}.css", css_idx).into_bytes()
        }
        RefKind::PosFid { elem_idx, offset } => {
            let (file_num, target_pos) = match elems.get(*elem_idx) {
                Some(elem) => (
                    elem.file_number as usize,
                    // `offset` is decoded from the document and may be
                    // arbitrarily large. Add in `usize` and saturate rather
                    // than truncating to `u32` and risking an overflow panic;
                    // the lookup clamps the position to the file anyway.
                    (elem.insert_pos as usize).saturating_add(*offset),
                ),
                None => (0, 0),
            };
            let page = format!("part{:04}.html", file_num);
            match find_nearest_id_fast(raw_text, target_pos, file_num, file_starts) {
                Some(id) => format!("{}#{}", page, id).into_bytes(),
                None => page.into_bytes(),
            }
        }
        RefKind::PosFidOld { elem_idx } => {
            let file_num = elems
                .get(*elem_idx)
                .map(|e| e.file_number as usize)
                .unwrap_or(0);
            format!("part{:04}.html", file_num).into_bytes()
        }
        RefKind::Embed { img_idx, ext } => {
            format!("images/image_{:04}.{}", img_idx, ext).into_bytes()
        }
        RefKind::Malformed => Vec::new(),
    }
}
/// Finds an anchor name close to byte offset `pos` of `raw_text`, restricted
/// to the file numbered `file_num`.
///
/// `file_starts` holds `(start_offset, file_number)` pairs; a file extends from
/// its start offset to the start offset of the following entry (or to the end
/// of `raw_text` for the last entry). If `file_num` is not listed, the whole
/// text is searched. `pos` is clamped into the file's range.
///
/// The search looks for `id`, `name` and `aid` attributes (double- or
/// single-quoted):
///
/// 1. forward, in up to 2000 bytes starting at `pos`: the attribute that
///    appears first wins;
/// 2. otherwise backward, in up to 2000 bytes ending at `pos`: the attribute
///    that appears last wins, ignoring any that precede a `<body ` tag found
///    in that window.
///
/// `aid` values are returned with an `aid-` prefix. Returns `None` when
/// neither window yields a usable attribute.
pub fn find_nearest_id_fast(
    raw_text: &[u8],
    pos: usize,
    file_num: usize,
    file_starts: &[(u32, u32)],
) -> Option<String> {
    const SEARCH_RADIUS: usize = 2000;

    let text_len = raw_text.len();
    let (file_start, file_end) = file_starts
        .iter()
        .position(|&(_, fnum)| fnum as usize == file_num)
        .map_or((0, text_len), |i| {
            let start = file_starts[i].0 as usize;
            let end = file_starts
                .get(i + 1)
                .map_or(text_len, |&(next_start, _)| next_start as usize);
            (start, end)
        });
    // Normalise the bounds so that inconsistent `file_starts` (unsorted
    // entries, offsets past the end of the text) yield an empty search range
    // instead of a panic in `clamp` or in the slicing below.
    let file_end = file_end.min(text_len);
    let file_start = file_start.min(file_end);
    let pos = pos.clamp(file_start, file_end);

    let id_finder = memmem::Finder::new(b" id=\"");
    let id_finder_single = memmem::Finder::new(b" id='");
    let name_finder = memmem::Finder::new(b" name=\"");
    let name_finder_single = memmem::Finder::new(b" name='");
    let aid_finder = memmem::Finder::new(b" aid=\"");
    let aid_finder_single = memmem::Finder::new(b" aid='");

    // (double-quote finder, single-quote finder, offset of the quote within
    // the needle, whether the value needs the `aid-` prefix)
    let attrs = [
        (&id_finder, &id_finder_single, 4usize, false),
        (&name_finder, &name_finder_single, 6usize, false),
        (&aid_finder, &aid_finder_single, 5usize, true),
    ];

    // Pass 1: nearest attribute at or after `pos`.
    let forward_end = pos.saturating_add(SEARCH_RADIUS).min(file_end);
    if pos < forward_end {
        let window = &raw_text[pos..forward_end];
        let nearest = attrs
            .iter()
            .filter_map(|&(double, single, attr_len, is_aid)| {
                find_attr_with_pos(window, double, single, attr_len)
                    .map(|(at, value)| (at, value, is_aid))
            })
            .min_by_key(|candidate| candidate.0);
        if let Some((_, value, is_aid)) = nearest {
            return Some(if is_aid {
                format!("aid-{}", value)
            } else {
                value
            });
        }
    }

    // Pass 2: closest attribute before `pos`.
    let back_start = pos.saturating_sub(SEARCH_RADIUS).max(file_start);
    if back_start < pos {
        let window = &raw_text[back_start..pos];
        let body_pos = memmem::find(window, b"<body ");
        let mut best: Option<(usize, String)> = None;

        for &(double, single, attr_len, is_aid) in &attrs {
            let (at, value) = match find_last_attr_in_window(window, double, single, attr_len) {
                Some(found) => found,
                None => continue,
            };
            // Attributes that precede the `<body ` tag are not link targets.
            if matches!(body_pos, Some(body_at) if at < body_at) {
                continue;
            }
            let is_closer = best.as_ref().map_or(true, |(best_at, _)| at > *best_at);
            if is_closer {
                let value = if is_aid {
                    format!("aid-{}", value)
                } else {
                    value
                };
                best = Some((at, value));
            }
        }

        if let Some((_, value)) = best {
            return Some(value);
        }
    }

    None
}
/// Finds the first occurrence of an attribute in `window` and returns its
/// position together with its value.
///
/// `finder_double` / `finder_single` match the attribute opener with a double
/// and a single quote respectively (e.g. ` id="` and ` id='`); `attr_len` is
/// the offset of the quote character within that opener. Whichever opener
/// occurs earlier in `window` is used.
///
/// Returns `None` when there is no occurrence, when the value's closing quote
/// is not inside `window`, or when the value contains a byte other than ASCII
/// alphanumerics, `-`, `_`, `:` and `.`. Only the first occurrence is
/// examined; later ones are not tried.
fn find_attr_with_pos(
    window: &[u8],
    finder_double: &memmem::Finder,
    finder_single: &memmem::Finder,
    attr_len: usize,
) -> Option<(usize, String)> {
    // Take the earlier of the two quote styles; preferring one style outright
    // would let a distant match shadow a nearer one.
    let pos = match (finder_double.find(window), finder_single.find(window)) {
        (Some(double), Some(single)) => double.min(single),
        (double, single) => double.or(single)?,
    };

    let quote_char = window[pos + attr_len];
    let value = &window[pos + attr_len + 1..];
    let value_len = value.iter().position(|&b| b == quote_char)?;
    let id_bytes = &value[..value_len];

    let is_plain_id = id_bytes
        .iter()
        .all(|&b| b.is_ascii_alphanumeric() || matches!(b, b'-' | b'_' | b':' | b'.'));
    if !is_plain_id {
        return None;
    }
    Some((pos, String::from_utf8_lossy(id_bytes).into_owned()))
}
/// Finds the last usable occurrence of an attribute in `window` and returns
/// its position together with its value.
///
/// `finder_double` / `finder_single` match the attribute opener with a double
/// and a single quote respectively (e.g. ` id="` and ` id='`); `attr_len` is
/// the offset of the quote character within that opener.
///
/// An occurrence is usable when its closing quote lies inside `window` and the
/// value consists only of ASCII alphanumerics, `-`, `_`, `:` and `.`;
/// unusable occurrences are skipped. Both quote styles are scanned over the
/// whole window and the later usable hit is returned.
fn find_last_attr_in_window(
    window: &[u8],
    finder_double: &memmem::Finder,
    finder_single: &memmem::Finder,
    attr_len: usize,
) -> Option<(usize, String)> {
    /// Last usable occurrence for a single quote style, as `(position, value)`.
    fn last_valid<'w>(
        window: &'w [u8],
        finder: &memmem::Finder,
        attr_len: usize,
    ) -> Option<(usize, &'w [u8])> {
        let mut last = None;
        let mut search_pos = 0;
        while search_pos < window.len() {
            let abs_pos = match finder.find(&window[search_pos..]) {
                Some(rel_pos) => search_pos + rel_pos,
                None => break,
            };
            let quote_char = window.get(abs_pos + attr_len).copied().unwrap_or(b'"');
            let value = &window[abs_pos + attr_len + 1..];
            if let Some(value_len) = value.iter().position(|&b| b == quote_char) {
                let id_bytes = &value[..value_len];
                let is_plain_id = id_bytes.iter().all(|&b| {
                    b.is_ascii_alphanumeric() || matches!(b, b'-' | b'_' | b':' | b'.')
                });
                if is_plain_id {
                    last = Some((abs_pos, id_bytes));
                }
            }
            search_pos = abs_pos + 1;
        }
        last
    }

    // Scan each quote style on its own so that neither can hide matches of
    // the other, then keep whichever usable hit sits later in the window.
    let double = last_valid(window, finder_double, attr_len);
    let single = last_valid(window, finder_single, attr_len);
    let latest = match (double, single) {
        (Some(d), Some(s)) => Some(if s.0 > d.0 { s } else { d }),
        (d, s) => d.or(s),
    };

    latest.map(|(at, id_bytes)| (at, String::from_utf8_lossy(id_bytes).into_owned()))
}
/// Returns a copy of `html` in which every tag has been passed through
/// `clean_tag`.
///
/// A tag is the span from a `<` to the next `>` (inclusive). Text between tags
/// is copied unchanged, as is a trailing `<...` that is never closed.
pub fn strip_kindle_attributes_fast(html: &[u8]) -> Vec<u8> {
    let mut cleaned_html = Vec::with_capacity(html.len());
    let mut remaining = html;

    while !remaining.is_empty() {
        let lt = match memchr::memchr(b'<', remaining) {
            Some(lt) => lt,
            None => {
                // No more tags: the rest is plain text.
                cleaned_html.extend_from_slice(remaining);
                break;
            }
        };
        let (text, from_tag) = remaining.split_at(lt);
        cleaned_html.extend_from_slice(text);

        match memchr::memchr(b'>', from_tag) {
            Some(gt) => {
                let (tag, after_tag) = from_tag.split_at(gt + 1);
                cleaned_html.extend_from_slice(&clean_tag(tag));
                remaining = after_tag;
            }
            None => {
                // Unterminated tag: keep it as it is.
                cleaned_html.extend_from_slice(from_tag);
                break;
            }
        }
    }

    cleaned_html
}
/// Removes Kindle-specific attributes from a single tag.
///
/// `tag` is expected to span from `<` to `>`. Comments (`<!--`), `<!DOCTYPE`,
/// processing instructions (`<?`) and closing tags (`</`) are returned
/// unchanged. For every other tag the attributes named `aid` or starting with
/// `data-Amzn` / `data-amzn` are dropped together with their value; all other
/// bytes, including the whitespace that preceded a dropped attribute, are
/// copied through. Attribute scanning stops at the first `>` or `/` found
/// outside an attribute value.
///
/// A resulting tag that starts with `<img ` or `<IMG ` is additionally passed
/// through `ensure_img_alt`.
fn clean_tag(tag: &[u8]) -> Vec<u8> {
    if tag.starts_with(b"<!--")
        || tag.starts_with(b"<!DOCTYPE")
        || tag.starts_with(b"<?")
        || tag.starts_with(b"</")
    {
        return tag.to_vec();
    }

    let len = tag.len();
    let mut result = Vec::with_capacity(len);
    result.push(b'<');
    let mut i = 1;

    // Tag name. Any ASCII whitespace ends it, not only a space, so that
    // `<p\naid="..">` is still recognised as `p` followed by an attribute.
    while i < len && !tag[i].is_ascii_whitespace() && tag[i] != b'>' && tag[i] != b'/' {
        result.push(tag[i]);
        i += 1;
    }

    while i < len {
        // Whitespace between attributes is always kept (CR included).
        while i < len && tag[i].is_ascii_whitespace() {
            result.push(tag[i]);
            i += 1;
        }
        if i >= len || tag[i] == b'>' || tag[i] == b'/' {
            break;
        }

        // Attribute name.
        let attr_start = i;
        while i < len
            && tag[i] != b'='
            && !tag[i].is_ascii_whitespace()
            && tag[i] != b'>'
            && tag[i] != b'/'
        {
            i += 1;
        }
        let attr_name = &tag[attr_start..i];
        let should_strip = attr_name == b"aid"
            || attr_name.starts_with(b"data-Amzn")
            || attr_name.starts_with(b"data-amzn");

        // Optional `=value`; advance `i` past it whether it is kept or not.
        if i < len && tag[i] == b'=' {
            i += 1;
            match tag.get(i) {
                Some(&quote) if quote == b'"' || quote == b'\'' => {
                    i += 1;
                    while i < len && tag[i] != quote {
                        i += 1;
                    }
                    if i < len {
                        i += 1; // closing quote
                    }
                }
                _ => {
                    // Unquoted value: runs up to whitespace or the tag end.
                    while i < len && !tag[i].is_ascii_whitespace() && tag[i] != b'>' {
                        i += 1;
                    }
                }
            }
        }

        if !should_strip {
            result.extend_from_slice(&tag[attr_start..i]);
        }
    }

    // Whatever follows the attributes (`>`, `/>`, ...) is copied as it is.
    if i < len {
        result.extend_from_slice(&tag[i..]);
    }

    if result.starts_with(b"<img ") || result.starts_with(b"<IMG ") {
        return ensure_img_alt(&result);
    }
    result
}
/// Returns `tag` with an empty `alt=""` attribute appended when it has none.
///
/// A tag that already contains the text `alt=` anywhere, or that has no
/// closing `>`, is returned unchanged. Otherwise ` alt=""` is inserted
/// directly before the closing `>`, or before the `/>` of a self-closing tag
/// so that the result stays well-formed (`<img src="x" alt=""/>`).
fn ensure_img_alt(tag: &[u8]) -> Vec<u8> {
    // Tags are short, so a plain window scan is sufficient here.
    if tag.windows(4).any(|w| w == b"alt=") {
        return tag.to_vec();
    }

    let close_pos = match tag.iter().rposition(|&b| b == b'>') {
        // Self-closing tag: the attribute has to go in front of the slash.
        Some(gt) if gt > 0 && tag[gt - 1] == b'/' => gt - 1,
        Some(gt) => gt,
        None => return tag.to_vec(),
    };
    let (head, tail) = tag.split_at(close_pos);

    let mut result = Vec::with_capacity(tag.len() + 7);
    result.extend_from_slice(head);
    if !result.ends_with(b" ") {
        result.push(b' ');
    }
    result.extend_from_slice(b"alt=\"\"");
    result.extend_from_slice(tail);
    result
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Digits run `0-9` then `A-V`, most significant digit first.
    #[test]
    fn test_parse_base32() {
        let cases: [(&[u8], usize); 4] = [
            (b"0000", 0),
            (b"0001", 1),
            (b"000V", 31),
            (b"0010", 32),
        ];
        for (encoded, expected) in cases {
            assert_eq!(parse_base32(encoded), expected);
        }
    }

    /// The `aid` attribute disappears while the element and its text survive.
    #[test]
    fn test_strip_aid_attribute() {
        let stripped = strip_kindle_attributes_fast(b"<p aid=\"0001\">Hello</p>");
        let rendered = String::from_utf8_lossy(&stripped);
        eprintln!("Output: {:?}", rendered);

        assert!(!stripped.contains_str("aid="));
        let keeps_element = rendered.starts_with("<p") && rendered.contains(">Hello</p>");
        assert!(
            keeps_element,
            "Expected <p...>Hello</p>, got: {}",
            rendered
        );
    }

    /// An image without `alt` receives an empty one.
    #[test]
    fn test_img_alt() {
        let with_alt = ensure_img_alt(b"<img src=\"test.jpg\"/>");
        assert!(with_alt.contains_str("alt=\"\""));
    }
}