use std::collections::HashMap;
/// HTML tag names that are eligible to receive an `aid` attribute during
/// KF8/MOBI chunking.
/// NOTE(review): presumably mirrors the Kindlegen-style whitelist consumed by
/// `writer_transform::add_aid_attributes_fast` — confirm against its usage.
pub const AID_ABLE_TAGS: &[&str] = &[
"a",
"abbr",
"address",
"article",
"aside",
"audio",
"b",
"bdo",
"blockquote",
"body",
"button",
"cite",
"code",
"dd",
"del",
"details",
"dfn",
"div",
"dl",
"dt",
"em",
"fieldset",
"figcaption",
"figure",
"footer",
"h1",
"h2",
"h3",
"h4",
"h5",
"h6",
"header",
"hgroup",
"i",
"ins",
"kbd",
"label",
"legend",
"li",
"map",
"mark",
"meter",
"nav",
"ol",
"output",
"p",
"pre",
"progress",
"q",
"rp",
"rt",
"samp",
"section",
"select",
"small",
"span",
"strong",
"sub",
"summary",
"sup",
"textarea",
"time",
"ul",
"var",
"video",
];
/// One fragment of an HTML file destined for the KF8 chunk layout.
/// NOTE(review): in this file `Skeleton::chunks` is always left empty by
/// `process_file`, so several fields are currently write-only — hence the
/// allow below.  Confirm before removing fields.
#[derive(Debug, Clone)]
#[allow(dead_code)] pub struct Chunk {
// Raw bytes of the fragment (appended after the skeleton in `raw_text`).
pub raw: Vec<u8>,
// Byte position where the fragment is re-inserted — TODO confirm reference frame.
pub insert_pos: usize,
// Tags opened by this fragment — presumably for skeleton reconstruction; verify.
pub starts_tags: Vec<String>,
// Tags closed by this fragment.
pub ends_tags: Vec<String>,
// KF8 selector string addressing the fragment (cf. `ChunkEntry::selector`).
pub selector: String,
// Index of the source HTML file.
pub file_number: usize,
// Global chunk sequence number (cf. `ChunkEntry::sequence_number`).
pub sequence_number: usize,
// Start offset — TODO confirm whether file-relative or global.
pub start_pos: usize,
}
/// Per-file result of chunking: the transformed HTML plus any fragments
/// split out of it.
#[derive(Debug)]
pub struct Skeleton {
// Index of the source HTML file within the processed set.
pub file_number: usize,
// Transformed HTML bytes (output of `add_aid_attributes_fast`).
pub skeleton: Vec<u8>,
// Fragments split out of the skeleton; currently always empty (see `process_file`).
pub chunks: Vec<Chunk>,
// Byte offset of this skeleton within the concatenated text of all files.
pub start_pos: usize,
}
impl Skeleton {
    /// Total byte length of this skeleton's text: the skeleton bytes plus
    /// the raw bytes of every chunk.
    pub fn len(&self) -> usize {
        self.skeleton.len() + self.chunks.iter().map(|c| c.raw.len()).sum::<usize>()
    }

    /// True when the skeleton carries no bytes at all (neither skeleton
    /// text nor chunk payloads).  Complements `len` per Rust convention.
    pub fn is_empty(&self) -> bool {
        self.len() == 0
    }

    /// Full text of this skeleton: the skeleton bytes followed by each
    /// chunk's raw bytes, in order.
    pub fn raw_text(&self) -> Vec<u8> {
        // Preallocate: the exact size is known from `len`, avoiding
        // repeated grow-and-copy while appending.
        let mut result = Vec::with_capacity(self.len());
        result.extend_from_slice(&self.skeleton);
        for chunk in &self.chunks {
            result.extend_from_slice(&chunk.raw);
        }
        result
    }
}
/// Row of the SKEL table: one entry per processed file's skeleton.
/// Some fields are currently write-only in this module, hence the allow.
#[derive(Debug, Clone)]
#[allow(dead_code)] pub struct SkelEntry {
// Source file index; also embedded in `name`.
pub file_number: usize,
// Record name, formatted as "SKEL" plus the zero-padded file number.
pub name: String,
// Number of chunks carried by the skeleton.
pub chunk_count: usize,
// Copied from `Skeleton::start_pos`.
pub start_pos: usize,
// Length of the skeleton bytes only (chunk payloads excluded).
pub length: usize,
}
/// Row of the chunk table: one entry per emitted chunk of text.
#[derive(Debug, Clone)]
pub struct ChunkEntry {
// Offset within the concatenated text where this chunk begins.
pub insert_pos: usize,
// KF8 selector string, e.g. "P-//*[@aid='0000']".
pub selector: String,
// Index of the source HTML file.
pub file_number: usize,
// Global, monotonically increasing chunk sequence number.
pub sequence_number: usize,
// Offset of the chunk within its skeleton (0 for whole-skeleton chunks).
pub start_pos: usize,
// Chunk length in bytes.
pub length: usize,
}
/// Aggregate output of `Chunker::process`.  Some fields are not read within
/// this module, hence the allow.
#[allow(dead_code)] pub struct ChunkerResult {
// One skeleton per input file, in input order.
pub skeletons: Vec<Skeleton>,
// SKEL table rows derived from `skeletons`.
pub skel_table: Vec<SkelEntry>,
// Chunk table rows: one whole-skeleton entry per non-empty skeleton.
pub chunk_table: Vec<ChunkEntry>,
// Concatenated raw text of all skeletons (skeleton bytes + chunk bytes).
pub text: Vec<u8>,
// Keyed by a (String, String) pair mapping to an assigned aid string.
// NOTE(review): key semantics come from `add_aid_attributes_fast` — likely
// (file href, element id); confirm against writer_transform.
pub id_map: HashMap<(String, String), String>,
// aid -> (chunk sequence number, offset within that chunk, absolute offset in `text`).
pub aid_offset_map: HashMap<String, (usize, usize, usize)>,
// file href -> (position, id) pairs reported by the transform for that file.
pub filepos_map: HashMap<String, Vec<(usize, String)>>,
}
/// Splits a set of HTML files into KF8 skeletons and chunk/SKEL tables,
/// assigning `aid` attributes along the way.
pub struct Chunker {
// Monotonic aid counter shared across all files; advanced by
// `add_aid_attributes_fast`.
aid_counter: u32,
// Accumulated id map from all processed files (see `ChunkerResult::id_map`).
id_map: HashMap<(String, String), String>,
// file href -> (position, id) pairs, collected per file in `process_file`.
filepos_map: HashMap<String, Vec<(usize, String)>>,
}
impl Chunker {
    /// Creates a chunker with a zeroed aid counter and empty id/filepos maps.
    pub fn new() -> Self {
        Self {
            aid_counter: 0,
            id_map: HashMap::new(),
            filepos_map: HashMap::new(),
        }
    }

    /// Runs the full chunking pipeline over `html_files` (pairs of
    /// `(href, html bytes)`).
    ///
    /// Each file is transformed into a [`Skeleton`]; the SKEL and chunk
    /// tables are then derived and every skeleton's raw text is concatenated
    /// into a single blob.  The accumulated id/filepos maps are moved out of
    /// `self`, so a `Chunker` is effectively single-use.
    pub fn process(&mut self, html_files: &[(String, Vec<u8>)]) -> ChunkerResult {
        // Exact count known up front — preallocate.
        let mut skeletons = Vec::with_capacity(html_files.len());
        let mut start_pos = 0;
        for (i, (file_href, html)) in html_files.iter().enumerate() {
            let skeleton = self.process_file(i, file_href, html, start_pos);
            start_pos += skeleton.len();
            skeletons.push(skeleton);
        }

        let skel_table: Vec<SkelEntry> = skeletons
            .iter()
            .map(|s| SkelEntry {
                file_number: s.file_number,
                name: format!("SKEL{:010}", s.file_number),
                chunk_count: s.chunks.len(),
                start_pos: s.start_pos,
                length: s.skeleton.len(),
            })
            .collect();

        // One whole-skeleton chunk entry per non-empty skeleton.  The
        // selector always targets aid "0000".
        // NOTE(review): presumably the file's first aid-tagged element;
        // confirm against the KF8 selector convention.
        let mut chunk_table = Vec::new();
        let mut text_offset = 0usize;
        let mut seq_num = 0usize;
        for skel in &skeletons {
            let skel_len = skel.skeleton.len();
            if skel_len > 0 {
                chunk_table.push(ChunkEntry {
                    insert_pos: text_offset,
                    selector: "P-//*[@aid='0000']".to_string(),
                    file_number: skel.file_number,
                    sequence_number: seq_num,
                    start_pos: 0,
                    length: skel_len,
                });
                seq_num += 1;
            }
            text_offset += skel_len;
        }

        let text: Vec<u8> = skeletons.iter().flat_map(|s| s.raw_text()).collect();
        let aid_offset_map = self.build_aid_offset_map(&text, &chunk_table);
        ChunkerResult {
            skeletons,
            skel_table,
            chunk_table,
            text,
            id_map: std::mem::take(&mut self.id_map),
            aid_offset_map,
            filepos_map: std::mem::take(&mut self.filepos_map),
        }
    }

    /// Scans `text` for ` aid="XXXX"` attributes (four base-32 characters
    /// followed by a closing quote) and maps each aid to
    /// `(chunk sequence number, offset within that chunk, absolute offset)`.
    ///
    /// An offset not covered by any chunk falls back to sequence 0 with the
    /// absolute offset as the in-chunk offset (this also covers an empty
    /// `chunk_table`, so no separate emptiness check is needed).
    fn build_aid_offset_map(
        &self,
        text: &[u8],
        chunk_table: &[ChunkEntry],
    ) -> HashMap<String, (usize, usize, usize)> {
        use memchr::memmem;
        /// Attribute prefix that introduces an aid value.
        const NEEDLE: &[u8] = b" aid=\"";
        /// Aids are emitted fixed-width: four base-32 characters.
        const AID_LEN: usize = 4;

        let mut aid_offset_map = HashMap::new();
        let finder = memmem::Finder::new(NEEDLE);
        let mut search_pos = 0;
        while let Some(rel_pos) = finder.find(&text[search_pos..]) {
            let offset = search_pos + rel_pos;
            let val_start = offset + NEEDLE.len();
            // Require the AID_LEN value bytes plus the closing quote.
            if val_start + AID_LEN + 1 <= text.len() && text[val_start + AID_LEN] == b'"' {
                let aid =
                    String::from_utf8_lossy(&text[val_start..val_start + AID_LEN]).into_owned();
                let (seq_num, offset_in_chunk) = chunk_table
                    .iter()
                    .find(|c| (c.insert_pos..c.insert_pos + c.length).contains(&offset))
                    .map(|c| (c.sequence_number, offset - c.insert_pos))
                    .unwrap_or((0, offset));
                aid_offset_map.insert(aid, (seq_num, offset_in_chunk, offset));
            }
            // Resume just past the needle; a malformed match is skipped.
            search_pos = val_start;
        }
        aid_offset_map
    }

    /// Transforms one HTML file: inserts aid attributes via
    /// `add_aid_attributes_fast` and records any (position, id) pairs it
    /// reports.  No chunk splitting is done here, so `chunks` is left empty.
    fn process_file(
        &mut self,
        file_number: usize,
        file_href: &str,
        html: &[u8],
        start_pos: usize,
    ) -> Skeleton {
        let result = super::writer_transform::add_aid_attributes_fast(
            html,
            file_href,
            &mut self.aid_counter,
            &mut self.id_map,
        );
        if !result.position_map.is_empty() {
            self.filepos_map
                .insert(file_href.to_string(), result.position_map);
        }
        Skeleton {
            file_number,
            skeleton: result.html,
            chunks: Vec::new(),
            start_pos,
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Decodes a base-32 string (digits `0`-`9` and letters `a`-`v`/`A`-`V`)
    /// into a `u32`, saturating on overflow.  Characters outside the
    /// alphabet are skipped entirely.
    ///
    /// Bug fix: the previous version multiplied the accumulator by 32
    /// *before* validating the character, so an invalid character still
    /// shifted the result (e.g. `"1-0"` decoded to 1024 instead of 32).
    /// `char::to_digit(32)` accepts exactly the same alphabet,
    /// case-insensitively, as the old match arms.
    pub(super) fn from_base32(s: &str) -> u32 {
        s.chars()
            .filter_map(|c| c.to_digit(32))
            .fold(0u32, |acc, d| acc.saturating_mul(32).saturating_add(d))
    }

    #[test]
    fn test_from_base32() {
        assert_eq!(from_base32("0000"), 0);
        assert_eq!(from_base32("0001"), 1);
        assert_eq!(from_base32("000V"), 31);
        assert_eq!(from_base32("0010"), 32);
        // Lowercase digits decode identically.
        assert_eq!(from_base32("000v"), 31);
        // Non-alphabet characters are ignored, not treated as zero digits.
        assert_eq!(from_base32("1-0"), 32);
        assert_eq!(from_base32("-1"), 1);
    }

    #[test]
    fn test_add_aids() {
        use crate::mobi::writer_transform::add_aid_attributes_fast;
        let mut chunker = Chunker::new();
        let html = b"<html><body><p>Hello</p><div>World</div></body></html>";
        let result = add_aid_attributes_fast(
            html,
            "test.xhtml",
            &mut chunker.aid_counter,
            &mut chunker.id_map,
        );
        let result_str = String::from_utf8_lossy(&result.html);
        assert!(result_str.contains("aid=\"0000\""));
        assert!(result_str.contains("aid=\"0001\""));
    }
}