use std::collections::HashSet;
use std::path::Path;
use std::time::{SystemTime, UNIX_EPOCH};
use regex::Regex;
use crate::exth;
use crate::extracted::ExtractedEpub;
use crate::html_check;
use crate::indx::{self, LookupTerm};
use crate::opf::{self, DictionaryEntry, OPFData};
use crate::palmdoc;
/// Size in bytes of each uncompressed text record in the PalmDOC body.
const RECORD_SIZE: usize = 4096;
/// Length of the MOBI header written into record 0 — presumably the full
/// fixed-header span; used elsewhere in this module (TODO confirm usage).
const MOBI_HEADER_LENGTH: usize = 264;
/// Builds a MOBI file from the OPF package at `opf_path` and writes it to
/// `output_path`.
///
/// Thin convenience wrapper: extracts the EPUB data referenced by the OPF
/// and forwards every flag unchanged to `build_mobi_from_extracted`.
#[allow(clippy::too_many_arguments)]
pub fn build_mobi(
    opf_path: &Path,
    output_path: &Path,
    no_compress: bool,
    headwords_only: bool,
    srcs_data: Option<&[u8]>,
    include_cmet: bool,
    no_hd_images: bool,
    creator_tag: bool,
    kf8_only: bool,
    doc_type: Option<&str>,
    kindle_limits: bool,
    self_check: bool,
    kindlegen_parity: bool,
    strict_accents: bool,
) -> Result<(), Box<dyn std::error::Error>> {
    let extracted = ExtractedEpub::from_opf_path(opf_path)?;
    build_mobi_from_extracted(
        &extracted,
        output_path,
        no_compress,
        headwords_only,
        srcs_data,
        include_cmet,
        no_hd_images,
        creator_tag,
        kf8_only,
        doc_type,
        kindle_limits,
        self_check,
        kindlegen_parity,
        strict_accents,
    )
}
/// Dispatches the build to the dictionary or book pipeline depending on
/// whether the content HTML contains `<idx:entry>` tags.
///
/// Dictionaries are always emitted as MOBI7, so `kf8_only` is rejected for
/// them; books go through `build_book_mobi` (KF8-only or dual format).
#[allow(clippy::too_many_arguments)]
pub fn build_mobi_from_extracted(
    extracted: &ExtractedEpub,
    output_path: &Path,
    no_compress: bool,
    headwords_only: bool,
    srcs_data: Option<&[u8]>,
    include_cmet: bool,
    no_hd_images: bool,
    creator_tag: bool,
    kf8_only: bool,
    doc_type: Option<&str>,
    kindle_limits: bool,
    self_check: bool,
    kindlegen_parity: bool,
    strict_accents: bool,
) -> Result<(), Box<dyn std::error::Error>> {
    let opf = &extracted.opf;
    // Book path: hand everything to the book builder and return early.
    if !detect_dictionary(opf) {
        let banner = if kf8_only {
            "Detected book content, building KF8-only (.azw3)"
        } else {
            "Detected book content (no idx:entry tags found)"
        };
        eprintln!("{}", banner);
        // Dictionary-only flag; intentionally unused on the book path.
        let _ = strict_accents;
        return build_book_mobi(opf, output_path, no_compress, srcs_data, include_cmet, !no_hd_images, creator_tag, kf8_only, doc_type, kindle_limits, self_check, kindlegen_parity);
    }
    // Dictionary path.
    if kf8_only {
        return Err("KF8-only output is not supported for dictionaries (dictionaries use MOBI7 format)".into());
    }
    eprintln!("Detected dictionary content");
    // Book-only flag; intentionally unused on the dictionary path.
    let _ = kindlegen_parity;
    build_dictionary_mobi(opf, output_path, no_compress, headwords_only, srcs_data, include_cmet, creator_tag, kindle_limits, self_check, strict_accents)
}
/// Returns true when any content HTML file contains an `<idx:entry` tag,
/// which marks the package as a Kindle dictionary. Unreadable files are
/// treated as non-dictionary content.
fn detect_dictionary(opf: &OPFData) -> bool {
    opf.get_content_html_paths()
        .into_iter()
        .any(|path| {
            matches!(
                std::fs::read_to_string(&path),
                Ok(content) if content.contains("<idx:entry")
            )
        })
}
/// Kindle's 30 MB per-HTML-file size limit, in bytes (used for warnings
/// and for re-chunking dictionary content under --kindle-limits).
const KINDLE_HTML_SIZE_LIMIT: usize = 30 * 1024 * 1024;
/// Kindle's limit on the number of HTML files per book (warning only).
const KINDLE_HTML_FILE_LIMIT: usize = 300;
/// Builds a MOBI7 dictionary from parsed OPF data and writes the PalmDB
/// file to `output_path`.
///
/// Record layout emitted: record 0 (header + EXTH) | text records | image
/// records | orthographic INDX records | FLIS | FCIS | optional SRCS |
/// optional CMET | EOF.
#[allow(clippy::too_many_arguments)]
fn build_dictionary_mobi(
    opf: &OPFData,
    output_path: &Path,
    no_compress: bool,
    headwords_only: bool,
    srcs_data: Option<&[u8]>,
    include_cmet: bool,
    creator_tag: bool,
    kindle_limits: bool,
    self_check: bool,
    strict_accents: bool,
) -> Result<(), Box<dyn std::error::Error>> {
    // Parse every <idx:entry> from all content HTML files, in spine order.
    let mut all_entries: Vec<DictionaryEntry> = Vec::new();
    for html_path in opf.get_content_html_paths() {
        let entries = opf::parse_dictionary_html(&html_path)?;
        all_entries.extend(entries);
    }
    if all_entries.is_empty() {
        return Err("No dictionary entries found in HTML content files".into());
    }
    eprintln!("Parsed {} dictionary entries", all_entries.len());
    // Manifest images plus any images referenced only from HTML.
    let mut image_items = opf.get_image_items(); let extras = opf.find_unreferenced_images();
    if !extras.is_empty() {
        eprintln!(
            "Info: {} image(s) referenced by HTML but missing from manifest — embedding anyway",
            extras.len()
        );
        image_items.extend(extras);
    }
    let cover_href = opf.get_cover_image_href();
    let mut image_records: Vec<Vec<u8>> = Vec::new();
    // Maps manifest href -> 1-based recindex used when rewriting <img> tags.
    let mut href_to_recindex: std::collections::HashMap<String, usize> = std::collections::HashMap::new();
    let mut cover_offset: Option<u32> = None;
    let mut total_image_bytes: usize = 0;
    for (idx, (href, _media_type)) in image_items.iter().enumerate() {
        let recindex = idx + 1; let image_path = opf.base_dir.join(href);
        // Retry with a percent-decoded path when the raw href does not exist.
        let data = std::fs::read(&image_path).or_else(|_| {
            let decoded = percent_decode(href);
            std::fs::read(opf.base_dir.join(&decoded))
        });
        if let Ok(mut data) = data {
            // JFIF APP0 tweak: byte 13 is the density-units field; flip
            // 0 -> 1 — presumably for Kindle renderer compatibility
            // (TODO confirm rationale).
            if data.len() > 13
                && data[0] == 0xFF && data[1] == 0xD8
                && data[2] == 0xFF && data[3] == 0xE0
                && data[6..11] == *b"JFIF\0"
                && data[13] == 0x00
            {
                data[13] = 0x01;
            }
            total_image_bytes += data.len();
            href_to_recindex.insert(href.clone(), recindex);
            image_records.push(data);
            if let Some(ref cover) = cover_href {
                if href == cover {
                    cover_offset = Some(idx as u32);
                }
            }
        } else {
            // Keep an empty placeholder so recindex numbering stays aligned.
            eprintln!("Warning: could not read image file: {}", image_path.display());
            href_to_recindex.insert(href.clone(), recindex);
            image_records.push(Vec::new());
        }
    }
    if !image_records.is_empty() {
        eprintln!(
            "Collected {} images ({} bytes total)",
            image_records.len(),
            total_image_bytes
        );
    }
    let num_image_records = image_records.len();
    eprintln!("Building text content...");
    // --kindle-limits re-chunks the dictionary body under the size limits.
    let text_content = if kindle_limits {
        build_text_content_by_letter(&opf, &all_entries)
    } else {
        build_text_content(&opf, true)
    };
    let text_content = insert_guide_reference(&text_content);
    let text_content = if !href_to_recindex.is_empty() {
        rewrite_image_src(&text_content, &href_to_recindex, &opf.spine_items)
    } else {
        text_content
    };
    let text_content = pad_text_for_chunking(&text_content, RECORD_SIZE);
    if self_check {
        // Optional HTML sanity pass over the full blob and record splits.
        let issues = html_check::validate_text_blob(&text_content);
        if !issues.is_empty() {
            html_check::print_self_check_warnings(&issues);
        }
        let chunk_size = compute_chunk_size(text_content.len());
        let ranges = split_on_utf8_boundaries(&text_content, chunk_size);
        let record_issues = html_check::validate_records(&text_content, &ranges, 20);
        if !record_issues.is_empty() {
            html_check::print_self_check_warnings(&record_issues);
        }
    }
    let (text_records, text_length) = if no_compress {
        eprintln!("Splitting text into uncompressed records...");
        let result = split_text_uncompressed(&text_content);
        eprintln!(
            "Split text into {} uncompressed records ({} bytes)",
            result.0.len(),
            result.1
        );
        result
    } else {
        eprintln!("Compressing text...");
        let result = compress_text(&text_content);
        eprintln!(
            "Compressed text into {} records ({} bytes uncompressed)",
            result.0.len(),
            result.1
        );
        result
    };
    eprintln!("Finding entry positions...");
    // Byte offsets of each entry within the final text blob.
    let entry_positions = find_entry_positions(&text_content, &all_entries);
    eprintln!("Building lookup terms...");
    let lookup_terms = build_lookup_terms(
        &all_entries,
        &entry_positions,
        &text_content,
        headwords_only,
    );
    eprintln!("Building INDX records...");
    // Non-ASCII headword characters feed the orth INDX builder.
    let mut headword_chars_for_indx: HashSet<char> = HashSet::new();
    for entry in &all_entries {
        for c in entry.headword.chars() {
            if c as u32 > 0x7F {
                headword_chars_for_indx.insert(c);
            }
        }
    }
    let indx_records = indx::build_orth_indx(&lookup_terms, &headword_chars_for_indx, strict_accents);
    eprintln!(" Orth INDX: {} records", indx_records.len());
    let flis = build_flis();
    let fcis = build_fcis(text_length, 1); let eof = build_eof();
    // Optional SRCS record: "SRCS" magic, header offset, payload length,
    // a constant 1, then the raw payload.
    let srcs_record: Option<Vec<u8>> = srcs_data.map(|data| {
        let mut rec = Vec::with_capacity(16 + data.len());
        rec.extend_from_slice(b"SRCS");
        rec.extend_from_slice(&0x10u32.to_be_bytes()); rec.extend_from_slice(&(data.len() as u32).to_be_bytes());
        rec.extend_from_slice(&1u32.to_be_bytes());
        rec.extend_from_slice(data);
        rec
    });
    let cmet_record: Option<Vec<u8>> = if include_cmet {
        Some(build_cmet())
    } else {
        None
    };
    let num_optional = srcs_record.as_ref().map_or(0, |_| 1) + cmet_record.as_ref().map_or(0, |_| 1);
    // Record indices: record 0 is the header, text starts at record 1.
    let first_non_book = text_records.len() + 1;
    let first_image_record = if num_image_records > 0 {
        text_records.len() + 1
    } else {
        0xFFFFFFFF
    };
    let orth_index_record = text_records.len() + 1 + num_image_records;
    // No inflection index is emitted; 0xFFFFFFFF marks "absent".
    let infl_index_record = 0xFFFFFFFFusize;
    let flis_record = orth_index_record + indx_records.len();
    let fcis_record = flis_record + 1;
    let srcs_record_idx = if srcs_record.is_some() {
        Some(fcis_record + 1)
    } else {
        None
    };
    // The +3 accounts for FLIS + FCIS + EOF.
    let total_records = 1 + text_records.len() + num_image_records
        + indx_records.len() + 3 + num_optional;
    // All headword characters (ASCII included) for the header build.
    let mut headword_chars: HashSet<u32> = HashSet::new();
    for entry in &all_entries {
        for c in entry.headword.chars() {
            headword_chars.insert(c as u32);
        }
    }
    let record0 = build_record0(
        &opf,
        text_length,
        text_records.len(),
        first_non_book,
        orth_index_record,
        infl_index_record,
        total_records,
        flis_record,
        fcis_record,
        no_compress,
        &headword_chars,
        true, first_image_record,
        cover_offset,
        None, None, None, None, None, srcs_record_idx,
        None, creator_tag,
        None, );
    // Assemble the final record sequence in the layout described above.
    let mut all_records = vec![record0];
    all_records.extend(text_records);
    all_records.extend(image_records);
    all_records.extend(indx_records);
    all_records.push(flis);
    all_records.push(fcis);
    if let Some(srcs) = srcs_record {
        all_records.push(srcs);
    }
    if let Some(cmet) = cmet_record {
        all_records.push(cmet);
    }
    all_records.push(eof);
    let title = if opf.title.is_empty() {
        "Dictionary"
    } else {
        &opf.title
    };
    let palmdb = build_palmdb(title, &all_records);
    std::fs::write(output_path, &palmdb)?;
    eprintln!("Wrote {} ({} bytes)", output_path.display(), palmdb.len());
    Ok(())
}
/// Builds a book (non-dictionary) MOBI and writes it to `output_path`.
///
/// Emits either a KF8-only .azw3 (`kf8_only`) or a dual-format file:
/// KF7 records, a BOUNDARY record, then the KF8 section. An optional HD
/// image container (CONT/CRES) is appended after the EOF record.
fn build_book_mobi(
    opf: &OPFData,
    output_path: &Path,
    no_compress: bool,
    srcs_data: Option<&[u8]>,
    include_cmet: bool,
    hd_images: bool,
    creator_tag: bool,
    kf8_only: bool,
    doc_type: Option<&str>,
    kindle_limits: bool,
    self_check: bool,
    kindlegen_parity: bool,
) -> Result<(), Box<dyn std::error::Error>> {
    // Manifest images plus any images referenced only from HTML.
    let mut image_items = opf.get_image_items(); let extras = opf.find_unreferenced_images();
    if !extras.is_empty() {
        eprintln!(
            "Info: {} image(s) referenced by HTML but missing from manifest — embedding anyway",
            extras.len()
        );
        image_items.extend(extras);
    }
    let cover_href = opf.get_cover_image_href();
    let mut image_records: Vec<Vec<u8>> = Vec::new();
    // Maps manifest href -> 1-based recindex used when rewriting <img> tags.
    let mut href_to_recindex: std::collections::HashMap<String, usize> = std::collections::HashMap::new();
    let mut cover_offset: Option<u32> = None;
    let mut total_image_bytes: usize = 0;
    for (idx, (href, _media_type)) in image_items.iter().enumerate() {
        let recindex = idx + 1; let image_path = opf.base_dir.join(href);
        // Retry with a percent-decoded path when the raw href does not exist.
        let data = std::fs::read(&image_path).or_else(|_| {
            let decoded = percent_decode(href);
            std::fs::read(opf.base_dir.join(&decoded))
        });
        if let Ok(mut data) = data {
            // JFIF APP0 tweak: flip the density-units byte 0 -> 1 —
            // presumably for Kindle renderer compatibility (TODO confirm).
            if data.len() > 13
                && data[0] == 0xFF && data[1] == 0xD8 && data[2] == 0xFF && data[3] == 0xE0 && data[6..11] == *b"JFIF\0" && data[13] == 0x00 {
                data[13] = 0x01; }
            total_image_bytes += data.len();
            href_to_recindex.insert(href.clone(), recindex);
            image_records.push(data);
            if let Some(ref cover) = cover_href {
                if href == cover {
                    cover_offset = Some(idx as u32); }
            }
        } else {
            // Empty placeholder keeps recindex numbering aligned.
            eprintln!("Warning: could not read image file: {}", image_path.display());
            href_to_recindex.insert(href.clone(), recindex);
            image_records.push(Vec::new());
        }
    }
    if !image_records.is_empty() {
        eprintln!(
            "Collected {} images ({} bytes total)",
            image_records.len(),
            total_image_bytes
        );
    }
    // Generate a library thumbnail from the cover (when one exists). The
    // thumbnail becomes a normal image record but is excluded from the HD
    // container via `thumb_hd_skip`.
    let (thumb_offset, thumb_hd_skip): (Option<u32>, std::collections::HashSet<usize>) = {
        let mut hd_skip: std::collections::HashSet<usize> = std::collections::HashSet::new();
        let thumb_off = match cover_offset {
            Some(cov) => {
                let cov_idx = cov as usize;
                let cover_bytes = image_records.get(cov_idx).cloned().unwrap_or_default();
                if cover_bytes.is_empty() {
                    None
                } else if let Some(thumb) = build_thumbnail_record(&cover_bytes) {
                    let thumb_idx = image_records.len();
                    let thumb_len = thumb.len();
                    image_records.push(thumb);
                    let _ = total_image_bytes; hd_skip.insert(thumb_idx);
                    eprintln!(
                        "Generated {} byte library thumbnail (recindex {}, EXTH 202={})",
                        thumb_len,
                        thumb_idx + 1,
                        thumb_idx,
                    );
                    Some(thumb_idx as u32)
                } else {
                    eprintln!(
                        "Warning: could not decode cover image to generate thumbnail; library tile will fall back to the cover"
                    );
                    None
                }
            }
            None => None,
        };
        (thumb_off, hd_skip)
    };
    // EXTH cover URI in kindle:embed base-32 form (recindex is 1-based).
    let kf8_cover_uri: Option<String> = cover_offset.map(|off| {
        let recindex = (off as usize) + 1; format!("kindle:embed:{}", encode_kindle_embed_base32(recindex))
    });
    eprintln!("Building KF8 section...");
    let html_parts = build_html_parts(opf);
    if kindle_limits {
        // Warn (but do not fail) when Kindle's limits are exceeded.
        let num_html_files = html_parts.len();
        if num_html_files > KINDLE_HTML_FILE_LIMIT {
            eprintln!(
                "Warning: {} HTML files exceeds the Kindle limit of {} files",
                num_html_files, KINDLE_HTML_FILE_LIMIT
            );
        }
        for (i, part) in html_parts.iter().enumerate() {
            let part_size = part.len();
            if part_size > KINDLE_HTML_SIZE_LIMIT {
                eprintln!(
                    "Warning: HTML part {} is {} bytes, exceeds 30 MB Kindle limit ({} bytes)",
                    i + 1, part_size, KINDLE_HTML_SIZE_LIMIT
                );
            }
        }
    }
    let css_content = extract_css_content(opf);
    let kf8_title = if opf.title.is_empty() { "Book" } else { &opf.title };
    let kf8_section = crate::kf8::build_kf8_section(
        &html_parts,
        &css_content,
        &href_to_recindex,
        &opf.spine_items,
        no_compress,
        kindlegen_parity,
        kf8_title,
    );
    eprintln!(
        "KF8: {} text records ({} bytes), {} flows",
        kf8_section.text_records.len(),
        kf8_section.text_length,
        kf8_section.flow_count,
    );
    if self_check {
        let issues = html_check::validate_text_blob(&kf8_section.html_bytes);
        if !issues.is_empty() {
            html_check::print_self_check_warnings(&issues);
        }
    }
    // Optional SRCS record: "SRCS" magic + three header words + payload.
    let srcs_record: Option<Vec<u8>> = srcs_data.map(|data| {
        let mut rec = Vec::with_capacity(16 + data.len());
        rec.extend_from_slice(b"SRCS");
        rec.extend_from_slice(&0x10u32.to_be_bytes());
        rec.extend_from_slice(&(data.len() as u32).to_be_bytes());
        rec.extend_from_slice(&1u32.to_be_bytes());
        rec.extend_from_slice(data);
        rec
    });
    let cmet_record: Option<Vec<u8>> = if include_cmet {
        Some(build_cmet())
    } else {
        None
    };
    let num_optional = srcs_record.as_ref().map_or(0, |_| 1) + cmet_record.as_ref().map_or(0, |_| 1);
    let num_image_records = image_records.len();
    let fixed_layout = if opf.is_fixed_layout {
        eprintln!("Detected fixed-layout content");
        Some(exth::FixedLayoutMeta {
            is_fixed_layout: true,
            original_resolution: opf.original_resolution.clone(),
            page_progression_direction: opf.page_progression_direction.clone(),
        })
    } else {
        None
    };
    let title = if opf.title.is_empty() {
        "Book"
    } else {
        &opf.title
    };
    if kf8_only {
        // --- KF8-only (.azw3) layout ---
        // record 0 | KF8 text | null pad | images | fragment INDX + CNCX |
        // skeleton INDX | NCX INDX + CNCX | FDST | FLIS | FCIS | DATP |
        // optional SRCS/CMET | EOF | optional HD container.
        let kf8_text_count = kf8_section.text_records.len();
        let kf8_null_pad = kf8_text_count + 1;
        let kf8_first_image = if num_image_records > 0 {
            kf8_null_pad + 1
        } else {
            0xFFFFFFFF
        };
        let kf8_fragment_start = kf8_null_pad + 1 + num_image_records;
        let kf8_skeleton_start = kf8_fragment_start
            + kf8_section.fragment_indx.len()
            + kf8_section.cncx_records.len();
        let kf8_ncx_start = kf8_skeleton_start + kf8_section.skeleton_indx.len();
        let kf8_fdst_idx = kf8_ncx_start
            + kf8_section.ncx_indx.len()
            + kf8_section.ncx_cncx_records.len();
        let kf8_flis_idx = kf8_fdst_idx + 1;
        let kf8_fcis_idx = kf8_flis_idx + 1;
        let kf8_datp_idx = kf8_fcis_idx + 1;
        let kf8_srcs_idx = if srcs_record.is_some() {
            Some(kf8_fcis_idx + 1)
        } else {
            None
        };
        let kf8_first_nonbook = kf8_text_count + 2;
        let hd_container: Option<HdContainer> = if hd_images && num_image_records > 0 {
            eprintln!("Building HD image container (CONT/CRES)...");
            Some(build_hd_container(opf, &image_records, &thumb_hd_skip))
        } else {
            None
        };
        let hd_record_count = hd_container.as_ref().map_or(0, |hd| hd.total_record_count());
        let hd_geometry_string: Option<String> = hd_container.as_ref().map(|hd| hd.geometry_string());
        // Literal +1 terms: record 0, null pad, FDST, FLIS, FCIS, DATP, EOF.
        let total_records = 1 + kf8_text_count + 1 + num_image_records
            + kf8_section.fragment_indx.len()
            + kf8_section.cncx_records.len()
            + kf8_section.skeleton_indx.len()
            + kf8_section.ncx_indx.len()
            + kf8_section.ncx_cncx_records.len()
            + 1 + 1 + 1 + 1 + num_optional + 1 + hd_record_count;
        let kf8_record0 = build_kf8_record0(
            opf,
            kf8_section.text_length,
            kf8_text_count,
            kf8_first_nonbook,
            kf8_fdst_idx,
            kf8_section.flow_count,
            kf8_skeleton_start,
            kf8_fragment_start,
            kf8_ncx_start,
            kf8_datp_idx,
            kf8_flis_idx,
            kf8_fcis_idx,
            no_compress,
            cover_offset,
            thumb_offset,
            kf8_cover_uri.as_deref(),
            fixed_layout.as_ref(),
            kf8_first_image,
            creator_tag,
            kf8_srcs_idx,
            hd_geometry_string.as_deref(),
            total_records,
            doc_type,
        );
        let kf8_flis_rec = build_flis();
        let kf8_fcis_rec = build_fcis(kf8_section.text_length, kf8_section.flow_count);
        let eof = build_eof();
        let null_pad_rec = vec![0x00u8, 0x00u8];
        let mut all_records: Vec<Vec<u8>> = Vec::new();
        all_records.push(kf8_record0);
        all_records.extend(kf8_section.text_records);
        all_records.push(null_pad_rec);
        all_records.extend(image_records);
        all_records.extend(kf8_section.fragment_indx);
        all_records.extend(kf8_section.cncx_records);
        all_records.extend(kf8_section.skeleton_indx);
        all_records.extend(kf8_section.ncx_indx);
        all_records.extend(kf8_section.ncx_cncx_records);
        all_records.push(kf8_section.fdst);
        all_records.push(kf8_flis_rec);
        all_records.push(kf8_fcis_rec);
        all_records.push(kf8_section.datp);
        if let Some(srcs) = srcs_record {
            all_records.push(srcs);
        }
        if let Some(cmet) = cmet_record {
            all_records.push(cmet);
        }
        all_records.push(eof);
        // The HD container (if any) lives after the EOF record.
        if let Some(hd) = hd_container {
            all_records.extend(hd.into_records());
        }
        let hd_str = if hd_record_count > 0 {
            format!(", HD: {}", hd_record_count)
        } else {
            String::new()
        };
        eprintln!(
            "KF8-only: {} total records{}",
            all_records.len(),
            hd_str,
        );
        let palmdb = build_palmdb(title, &all_records);
        std::fs::write(output_path, &palmdb)?;
        eprintln!("Wrote {} ({} bytes)", output_path.display(), palmdb.len());
    } else {
        // --- Dual-format (KF7 | BOUNDARY | KF8) layout ---
        eprintln!("Building KF7 text content...");
        let text_content = build_text_content(opf, false);
        let text_content = if !href_to_recindex.is_empty() {
            rewrite_image_src(&text_content, &href_to_recindex, &opf.spine_items)
        } else {
            text_content
        };
        let (text_records, text_length) = if no_compress {
            eprintln!("Splitting KF7 text into uncompressed records...");
            let result = split_text_uncompressed(&text_content);
            eprintln!(
                "Split KF7 text into {} uncompressed records ({} bytes)",
                result.0.len(),
                result.1
            );
            result
        } else {
            eprintln!("Compressing KF7 text...");
            let result = compress_text(&text_content);
            eprintln!(
                "Compressed KF7 text into {} records ({} bytes uncompressed)",
                result.0.len(),
                result.1
            );
            result
        };
        // KF7-side record indices (absolute within the whole file).
        let kf7_first_non_book = text_records.len() + 1;
        let kf7_first_image = if num_image_records > 0 {
            text_records.len() + 1
        } else {
            0xFFFFFFFF
        };
        let kf7_flis = text_records.len() + 1 + num_image_records;
        let kf7_fcis = kf7_flis + 1;
        let kf7_srcs_idx = if srcs_record.is_some() {
            Some(kf7_fcis + 1)
        } else {
            None
        };
        let boundary_idx = 1 + text_records.len() + num_image_records + 2 + num_optional;
        let kf8_record0_global = boundary_idx + 1;
        let kf7_total = 1 + text_records.len() + num_image_records + 2 + num_optional;
        // KF8-side indices are relative to the KF8 record 0.
        let kf8_text_count = kf8_section.text_records.len();
        let kf8_null_pad = kf8_text_count + 1;
        let kf8_fragment_start = kf8_null_pad + 1;
        let kf8_skeleton_start = kf8_fragment_start
            + kf8_section.fragment_indx.len()
            + kf8_section.cncx_records.len();
        let kf8_ncx_start = kf8_skeleton_start + kf8_section.skeleton_indx.len();
        let kf8_fdst_idx = kf8_ncx_start
            + kf8_section.ncx_indx.len()
            + kf8_section.ncx_cncx_records.len();
        let kf8_flis_idx = kf8_fdst_idx + 1;
        let kf8_fcis_idx = kf8_flis_idx + 1;
        let kf8_datp_idx = kf8_fcis_idx + 1;
        let kf8_first_nonbook = kf8_text_count + 2;
        let hd_container: Option<HdContainer> = if hd_images && num_image_records > 0 {
            eprintln!("Building HD image container (CONT/CRES)...");
            Some(build_hd_container(opf, &image_records, &thumb_hd_skip))
        } else {
            None
        };
        let hd_record_count = hd_container.as_ref().map_or(0, |hd| hd.total_record_count());
        let hd_geometry_string: Option<String> = hd_container.as_ref().map(|hd| hd.geometry_string());
        let total_global_records = kf7_total + 1 + 1 + kf8_text_count + 1
            + kf8_section.fragment_indx.len()
            + kf8_section.cncx_records.len()
            + kf8_section.skeleton_indx.len()
            + kf8_section.ncx_indx.len()
            + kf8_section.ncx_cncx_records.len()
            + 1 + 1 + 3 + hd_record_count;
        // Books carry no dictionary headword character table.
        let empty_chars: HashSet<u32> = HashSet::new();
        let kf7_record0 = build_record0(
            opf,
            text_length,
            text_records.len(),
            kf7_first_non_book,
            0xFFFFFFFF_usize, 0xFFFFFFFF_usize, total_global_records,
            kf7_flis,
            kf7_fcis,
            no_compress,
            &empty_chars,
            false,
            kf7_first_image,
            cover_offset,
            thumb_offset,
            kf8_cover_uri.as_deref(),
            fixed_layout.as_ref(),
            Some(6),
            Some(kf8_record0_global as u32),
            kf7_srcs_idx,
            hd_geometry_string.as_deref(),
            creator_tag,
            doc_type,
        );
        let kf8_record0 = build_kf8_record0(
            opf,
            kf8_section.text_length,
            kf8_text_count,
            kf8_first_nonbook,
            kf8_fdst_idx,
            kf8_section.flow_count,
            kf8_skeleton_start,
            kf8_fragment_start,
            kf8_ncx_start,
            kf8_datp_idx,
            kf8_flis_idx,
            kf8_fcis_idx,
            no_compress,
            cover_offset,
            thumb_offset,
            kf8_cover_uri.as_deref(),
            fixed_layout.as_ref(),
            kf8_fdst_idx, creator_tag,
            None, None, 0, doc_type,
        );
        let kf7_flis_rec = build_flis();
        let kf7_fcis_rec = build_fcis(text_length, 1); let kf8_flis_rec = build_flis();
        let kf8_fcis_rec = build_fcis(kf8_section.text_length, kf8_section.flow_count);
        let eof = build_eof();
        let boundary_rec = b"BOUNDARY".to_vec();
        let null_pad_rec = vec![0x00u8, 0x00u8];
        // KF7 half, then BOUNDARY, then the complete KF8 section.
        let mut all_records: Vec<Vec<u8>> = Vec::new();
        all_records.push(kf7_record0);
        all_records.extend(text_records);
        all_records.extend(image_records);
        all_records.push(kf7_flis_rec);
        all_records.push(kf7_fcis_rec);
        if let Some(srcs) = srcs_record {
            all_records.push(srcs);
        }
        if let Some(cmet) = cmet_record {
            all_records.push(cmet);
        }
        all_records.push(boundary_rec);
        all_records.push(kf8_record0);
        all_records.extend(kf8_section.text_records);
        all_records.push(null_pad_rec);
        all_records.extend(kf8_section.fragment_indx);
        all_records.extend(kf8_section.cncx_records);
        all_records.extend(kf8_section.skeleton_indx);
        all_records.extend(kf8_section.ncx_indx);
        all_records.extend(kf8_section.ncx_cncx_records);
        all_records.push(kf8_section.fdst);
        all_records.push(kf8_flis_rec);
        all_records.push(kf8_fcis_rec);
        all_records.push(kf8_section.datp);
        all_records.push(eof);
        if let Some(hd) = hd_container {
            all_records.extend(hd.into_records());
        }
        let hd_str = if hd_record_count > 0 {
            format!(", HD: {}", hd_record_count)
        } else {
            String::new()
        };
        eprintln!(
            "Dual format: {} total records (KF7: {}, boundary: 1, KF8: {}{})",
            all_records.len(),
            kf7_total,
            all_records.len() - kf7_total - 1 - hd_record_count,
            hd_str,
        );
        let palmdb = build_palmdb(title, &all_records);
        std::fs::write(output_path, &palmdb)?;
        eprintln!("Wrote {} ({} bytes)", output_path.display(), palmdb.len());
    }
    Ok(())
}
/// In-memory HD image container ("CONT/CRES") that is appended after the
/// EOF record of the main MOBI stream.
struct HdContainer {
    /// Serialized CONT header record describing the container.
    cont_record: Vec<u8>,
    /// One CRES record per image slot (placeholder bytes for skipped slots).
    cres_records: Vec<Vec<u8>>,
    /// Pipe-separated kindle:embed reference list record (may be empty).
    kindle_embed_list: Vec<u8>,
    /// Largest width seen across all HD images.
    max_width: u32,
    /// Largest height seen across all HD images.
    max_height: u32,
    /// Number of CRES slots (one per image record, including placeholders).
    num_cres_slots: usize,
}
impl HdContainer {
    /// Total PalmDB records the container occupies: BOUNDARY + CONT +
    /// every CRES slot + embed list + CONTBOUNDARY + trailer.
    fn total_record_count(&self) -> usize {
        self.cres_records.len() + 5
    }
    /// EXTH geometry string, e.g. "600x800:0-12|"; the range end covers
    /// the CONT record, all CRES slots, and the embed-list record.
    fn geometry_string(&self) -> String {
        format!(
            "{}x{}:0-{}|",
            self.max_width,
            self.max_height,
            self.num_cres_slots + 2
        )
    }
    /// Consumes the container and serializes it as the trailing record list.
    fn into_records(self) -> Vec<Vec<u8>> {
        let mut out: Vec<Vec<u8>> = Vec::with_capacity(self.total_record_count());
        out.push(b"BOUNDARY".to_vec());
        out.push(self.cont_record);
        out.extend(self.cres_records);
        out.push(self.kindle_embed_list);
        out.push(b"CONTBOUNDARY".to_vec());
        // Fixed 4-byte container terminator.
        out.push(vec![0xE9, 0x8E, 0x0D, 0x0A]);
        out
    }
}
/// Builds the HD image container: one CRES record per image slot (images
/// with readable dimensions carry their bytes; unreadable or skipped slots
/// get a 4-byte 0xA0 placeholder), plus the CONT header and the
/// kindle:embed reference list.
fn build_hd_container(
    opf: &OPFData,
    image_records: &[Vec<u8>],
    hd_skip: &std::collections::HashSet<usize>,
) -> HdContainer {
    let title = if opf.title.is_empty() { "Book" } else { &opf.title };
    let num_images = image_records.len();
    let mut cres_records: Vec<Vec<u8>> = Vec::new();
    let mut hd_image_count: u32 = 0;
    let mut max_width: u32 = 0;
    let mut max_height: u32 = 0;
    let mut kindle_embed_parts: Vec<String> = Vec::new();
    for (idx, img_data) in image_records.iter().enumerate() {
        // Placeholder slot for empty images and explicitly skipped indices
        // (e.g. the generated library thumbnail).
        if img_data.is_empty() || hd_skip.contains(&idx) {
            cres_records.push(vec![0xA0, 0xA0, 0xA0, 0xA0]);
            continue;
        }
        let dims = get_image_dimensions(img_data);
        if let Some((w, h)) = dims {
            // CRES record: "CRES" magic + two header words + raw image bytes.
            let mut cres = Vec::with_capacity(12 + img_data.len());
            cres.extend_from_slice(b"CRES");
            cres.extend_from_slice(&0u32.to_be_bytes()); cres.extend_from_slice(&12u32.to_be_bytes()); cres.extend_from_slice(img_data);
            cres_records.push(cres);
            hd_image_count += 1;
            if w > max_width { max_width = w; }
            if h > max_height { max_height = h; }
            let recindex = idx + 1;
            // NOTE(review): mime is always image/jpg regardless of the
            // actual image type — confirm this is intended.
            let embed_ref = format!(
                "kindle:embed:{}?mime=image/jpg",
                encode_kindle_embed_base32(recindex)
            );
            kindle_embed_parts.push(embed_ref);
        } else {
            // Could not sniff dimensions: fall back to a placeholder slot.
            cres_records.push(vec![0xA0, 0xA0, 0xA0, 0xA0]);
        }
    }
    // Embed list is pipe-separated with a trailing pipe, or empty.
    let kindle_embed_list = if kindle_embed_parts.is_empty() {
        Vec::new()
    } else {
        let mut list_str = kindle_embed_parts.join("|");
        list_str.push('|');
        list_str.into_bytes()
    };
    let cont_record = build_cont_record(
        title,
        &opf.author,
        num_images,
        hd_image_count,
        max_width,
        max_height,
    );
    eprintln!(
        "HD container: {} image slots, {} HD images, max {}x{}",
        num_images, hd_image_count, max_width, max_height,
    );
    HdContainer {
        cont_record,
        cres_records,
        kindle_embed_list,
        max_width,
        max_height,
        num_cres_slots: num_images,
    }
}
/// Serializes the CONT header record for the HD container: a 48-byte fixed
/// header, an EXTH metadata block, and a zero-padded title area.
fn build_cont_record(
    title: &str,
    _author: &str,
    num_cres_slots: usize,
    num_hd_images: u32,
    max_width: u32,
    max_height: u32,
) -> Vec<u8> {
    // Record index of the kindle:embed list within the container, and the
    // container's total record count (CONT + slots + list + trailer).
    let kindle_embed_index = num_cres_slots + 1;
    let container_total = num_cres_slots + 3;
    let mut exth_records: Vec<Vec<u8>> = vec![
        exth::exth_record(125, &4u32.to_be_bytes()),
        exth::exth_record(204, &202u32.to_be_bytes()), exth::exth_record(205, &0u32.to_be_bytes()), exth::exth_record(206, &1u32.to_be_bytes()), exth::exth_record(535, format!("kindling-{}", env!("CARGO_PKG_VERSION")).as_bytes()),
        exth::exth_record(207, &0u32.to_be_bytes()), exth::exth_record(539, b"application/image"), ];
    // EXTH 538: max image dimensions as "WxH".
    let dims_str = format!("{}x{}", max_width, max_height);
    exth_records.push(exth::exth_record(538, dims_str.as_bytes())); let title_bytes = if title.is_empty() { b"Book".to_vec() } else { title.as_bytes().to_vec() };
    // EXTH 542: first four bytes of the title's hash (md5_simple).
    let title_hash = md5_simple(&title_bytes);
    exth_records.push(exth::exth_record(542, &title_hash[..4]));
    exth_records.push(exth::exth_record(543, b"HD_CONTAINER"));
    let exth_record_data: Vec<u8> = exth_records.iter().flat_map(|r| r.iter().copied()).collect();
    // EXTH block: "EXTH" + length + record count + records, 4-byte padded.
    let exth_length = 12 + exth_record_data.len();
    let exth_padding = (4 - (exth_length % 4)) % 4;
    let exth_padded_length = exth_length + exth_padding;
    let mut exth_block = Vec::with_capacity(exth_padded_length);
    exth_block.extend_from_slice(b"EXTH");
    exth_block.extend_from_slice(&(exth_padded_length as u32).to_be_bytes());
    exth_block.extend_from_slice(&(exth_records.len() as u32).to_be_bytes());
    exth_block.extend_from_slice(&exth_record_data);
    exth_block.extend_from_slice(&vec![0u8; exth_padding]);
    let title_raw = title.as_bytes();
    let title_len = title_raw.len();
    let header_size = 48;
    let exth_offset = header_size + exth_block.len();
    // Title area: at least 256 bytes, rounded up to a 4-byte boundary.
    let title_area_size = std::cmp::max(256, title_len.div_ceil(4) * 4);
    let total_size = header_size + exth_block.len() + title_area_size;
    let mut record = Vec::with_capacity(total_size);
    record.extend_from_slice(b"CONT");
    record.extend_from_slice(&(total_size as u32).to_be_bytes());
    // Version word (1) in the high half, container record total in the low.
    record.extend_from_slice(&((1u32 << 16) | container_total as u32).to_be_bytes());
    // 65001 = UTF-8 codepage; the following words mirror the slot counts
    // and offsets computed above.
    record.extend_from_slice(&65001u32.to_be_bytes()); record.extend_from_slice(&0u32.to_be_bytes()); record.extend_from_slice(&1u32.to_be_bytes()); record.extend_from_slice(&(num_cres_slots as u32).to_be_bytes()); record.extend_from_slice(&num_hd_images.to_be_bytes()); record.extend_from_slice(&(kindle_embed_index as u32).to_be_bytes()); record.extend_from_slice(&1u32.to_be_bytes()); record.extend_from_slice(&(exth_offset as u32).to_be_bytes()); record.extend_from_slice(&(title_len as u32).to_be_bytes());
    record.extend_from_slice(&exth_block);
    record.extend_from_slice(title_raw);
    // Zero-fill up to the declared total size.
    while record.len() < total_size {
        record.push(0x00);
    }
    record
}
/// Encodes a 1-based record index as the fixed four-character base-32
/// string used in `kindle:embed:XXXX` references (digits then A-V).
/// Values >= 32^4 silently lose their high digits.
fn encode_kindle_embed_base32(recindex: usize) -> String {
    const CHARS: &[u8] = b"0123456789ABCDEFGHIJKLMNOPQRSTUV";
    let mut remaining = recindex;
    let mut digits = Vec::with_capacity(4);
    // Emit least-significant digit first, then reverse into display order.
    for _ in 0..4 {
        digits.push(CHARS[remaining % 32]);
        remaining /= 32;
    }
    digits.reverse();
    String::from_utf8(digits).unwrap()
}
/// Decodes the cover image and re-encodes it as a JPEG thumbnail that fits
/// within a 330x470 box (aspect ratio preserved by `thumbnail`). Returns
/// `None` when the cover cannot be decoded or re-encoded.
fn build_thumbnail_record(cover_bytes: &[u8]) -> Option<Vec<u8>> {
    const THUMB_BOX_W: u32 = 330;
    const THUMB_BOX_H: u32 = 470;
    const THUMB_QUALITY: u8 = 80;
    let thumb = image::load_from_memory(cover_bytes)
        .ok()?
        .thumbnail(THUMB_BOX_W, THUMB_BOX_H);
    let mut jpeg_bytes: Vec<u8> = Vec::with_capacity(16 * 1024);
    let encoder = image::codecs::jpeg::JpegEncoder::new_with_quality(
        std::io::Cursor::new(&mut jpeg_bytes),
        THUMB_QUALITY,
    );
    thumb.write_with_encoder(encoder).ok()?;
    Some(jpeg_bytes)
}
/// Best-effort sniff of `(width, height)` from raw image bytes.
/// Supports JPEG, PNG, and GIF signatures; returns `None` for anything
/// else or for buffers shorter than 24 bytes.
fn get_image_dimensions(data: &[u8]) -> Option<(u32, u32)> {
    if data.len() < 24 {
        return None;
    }
    match data {
        // JPEG: SOI marker; dimensions live in a SOF segment.
        [0xFF, 0xD8, ..] => get_jpeg_dimensions(data),
        // PNG: 8-byte signature, IHDR chunk holds big-endian w/h.
        [0x89, b'P', b'N', b'G', 0x0D, 0x0A, 0x1A, 0x0A, ..]
            if &data[12..16] == b"IHDR" =>
        {
            let w = u32::from_be_bytes([data[16], data[17], data[18], data[19]]);
            let h = u32::from_be_bytes([data[20], data[21], data[22], data[23]]);
            Some((w, h))
        }
        // GIF: logical screen size as little-endian u16s at offsets 6/8.
        [b'G', b'I', b'F', ..] => {
            let w = u16::from_le_bytes([data[6], data[7]]) as u32;
            let h = u16::from_le_bytes([data[8], data[9]]) as u32;
            Some((w, h))
        }
        _ => None,
    }
}
/// Scans JPEG marker segments for a start-of-frame header and returns
/// `Some((width, height))` from it, or `None` if none is found.
///
/// Handles SOI, 0xFF fill bytes, standalone markers (TEM, RST0-7) that
/// carry no length field, 0xFF 0x00 byte stuffing, and all SOF variants
/// (C0-C3, C5-C7, C9-CB, CD-CF). Stops at SOS or EOI.
fn get_jpeg_dimensions(data: &[u8]) -> Option<(u32, u32)> {
    let mut i = 0;
    while i + 1 < data.len() {
        if data[i] != 0xFF {
            // Not at a marker; resynchronize byte-by-byte.
            i += 1;
            continue;
        }
        let marker = data[i + 1];
        match marker {
            // Fill byte: runs of 0xFF may pad before the real marker id.
            // Previously this fell into the length-skip branch and read a
            // bogus segment length, skipping past the real SOF.
            0xFF => {
                i += 1;
            }
            // SOI — start of image, no length field.
            0xD8 => {
                i += 2;
            }
            // TEM (0x01) and RST0-7 are standalone markers with no length
            // field; treating them as sized segments mis-parsed the stream.
            0x01 | 0xD0..=0xD7 => {
                i += 2;
            }
            // EOI / SOS: a frame header must appear before scan data.
            0xD9 | 0xDA => break,
            // Any SOF marker: after the 2-byte length and 1-byte precision
            // come height then width as big-endian u16s.
            0xC0..=0xC3 | 0xC5..=0xC7 | 0xC9..=0xCB | 0xCD..=0xCF => {
                if i + 9 <= data.len() {
                    let h = u16::from_be_bytes([data[i + 5], data[i + 6]]) as u32;
                    let w = u16::from_be_bytes([data[i + 7], data[i + 8]]) as u32;
                    return Some((w, h));
                }
                break;
            }
            // 0xFF 0x00 byte stuffing inside entropy-coded data.
            0x00 => {
                i += 2;
            }
            // Every other segment carries a 2-byte big-endian length that
            // includes the length field itself.
            _ => {
                if i + 4 <= data.len() {
                    let length = u16::from_be_bytes([data[i + 2], data[i + 3]]) as usize;
                    i += 2 + length;
                } else {
                    break;
                }
            }
        }
    }
    None
}
/// Reads every content HTML file in spine order and returns the cleaned
/// markup of each as its own part; unreadable files yield the cleanup of
/// an empty string.
fn build_html_parts(opf: &OPFData) -> Vec<String> {
    opf.get_content_html_paths()
        .into_iter()
        .map(|html_path| {
            let raw = std::fs::read_to_string(&html_path).unwrap_or_default();
            clean_book_html(&raw)
        })
        .collect()
}
/// Concatenates all readable stylesheets from the manifest (entries whose
/// media type is text/css or whose href ends in ".css"), joined by
/// newlines in manifest iteration order.
fn extract_css_content(opf: &OPFData) -> String {
    let mut stylesheet_chunks: Vec<String> = Vec::new();
    for (_, (href, media_type)) in &opf.manifest {
        let looks_like_css = media_type == "text/css" || href.ends_with(".css");
        if !looks_like_css {
            continue;
        }
        // Unreadable stylesheets are silently skipped.
        if let Ok(text) = std::fs::read_to_string(opf.base_dir.join(href)) {
            stylesheet_chunks.push(text);
        }
    }
    stylesheet_chunks.join("\n")
}
/// Merges all content HTML files into one MOBI7-style HTML blob.
///
/// `strip_idx` selects dictionary cleanup (`strip_idx_markup`) vs. book
/// cleanup (`clean_book_html`). The first <head> found anywhere is reused;
/// bodies are joined with <mbp:pagebreak/>.
fn build_text_content(opf: &OPFData, strip_idx: bool) -> Vec<u8> {
    let mut parts: Vec<String> = Vec::new();
    for html_path in opf.get_content_html_paths() {
        let content = std::fs::read_to_string(&html_path).unwrap_or_default();
        let cleaned = if strip_idx {
            strip_idx_markup(&content)
        } else {
            clean_book_html(&content)
        };
        parts.push(cleaned);
    }
    let body_re = Regex::new(r"(?s)<body[^>]*>(.*?)</body>").unwrap();
    let head_re = Regex::new(r"(?s)<head[^>]*>.*?</head>").unwrap();
    let mut body_contents: Vec<String> = Vec::new();
    let mut first_head: Option<String> = None;
    for part in &parts {
        // Use the <body> interior when present, otherwise the whole part.
        if let Some(cap) = body_re.captures(part) {
            body_contents.push(cap.get(1).unwrap().as_str().trim().to_string());
        } else {
            body_contents.push(part.clone());
        }
        if first_head.is_none() {
            if let Some(cap) = head_re.captures(part) {
                first_head = Some(cap.get(0).unwrap().as_str().to_string());
            }
        }
    }
    // Fall back to a minimal head containing an empty <guide>.
    let head = first_head.unwrap_or_else(|| "<head><guide></guide></head>".to_string());
    let merged_body = body_contents.join("<mbp:pagebreak/>");
    let combined = format!(
        "<html>{}<body>{} <mbp:pagebreak/></body></html>",
        head, merged_body
    );
    combined.into_bytes()
}
/// Dictionary text merge used with --kindle-limits: front-matter files are
/// kept as-is, while entry HTML is re-emitted from `entries` and greedily
/// packed into sections no larger than `KINDLE_HTML_SIZE_LIMIT` each.
fn build_text_content_by_letter(opf: &OPFData, entries: &[DictionaryEntry]) -> Vec<u8> {
    let mut front_matter_sections: Vec<String> = Vec::new();
    let mut first_style: Option<String> = None;
    let mut has_frameset = false;
    let body_re = Regex::new(r"(?s)<body[^>]*>(.*?)</body>").unwrap();
    let style_re = Regex::new(r"(?s)<style[^>]*>.*?</style>").unwrap();
    for html_path in opf.get_content_html_paths() {
        if let Ok(content) = std::fs::read_to_string(&html_path) {
            if content.contains("<idx:entry") {
                // Dictionary file: only harvest its <style> blocks and note
                // whether it used <mbp:frameset>; its entries come from
                // `entries` below instead of the raw file.
                if first_style.is_none() {
                    let head_re = Regex::new(r"(?s)<head>.*?</head>").unwrap();
                    if let Some(head_match) = head_re.find(&content) {
                        let styles: Vec<String> = style_re
                            .find_iter(head_match.as_str())
                            .map(|m| m.as_str().to_string())
                            .collect();
                        if !styles.is_empty() {
                            first_style = Some(styles.join(""));
                        }
                    }
                }
                if content.contains("<mbp:frameset") {
                    has_frameset = true;
                }
                continue; }
            // Non-dictionary file: treat its body as front matter.
            let cleaned = strip_idx_markup(&content);
            if let Some(cap) = body_re.captures(&cleaned) {
                front_matter_sections.push(cap.get(1).unwrap().as_str().trim().to_string());
            } else {
                front_matter_sections.push(cleaned);
            }
        }
    }
    use rayon::prelude::*;
    // Strip idx markup from every entry in parallel (order preserved).
    let stripped_entries: Vec<String> = entries
        .par_iter()
        .map(|entry| strip_idx_markup(&entry.html_content))
        .collect();
    // Greedily pack stripped entries into sections under the size limit.
    let mut dict_sections: Vec<String> = Vec::new();
    let mut current_chunk = String::new();
    for stripped in stripped_entries {
        if !current_chunk.is_empty()
            && current_chunk.len() + stripped.len() > KINDLE_HTML_SIZE_LIMIT
        {
            dict_sections.push(current_chunk);
            current_chunk = String::new();
        }
        current_chunk.push_str(&stripped);
    }
    if !current_chunk.is_empty() {
        dict_sections.push(current_chunk);
    }
    eprintln!("Kindle limits: split {} entries into {} sections", entries.len(), dict_sections.len());
    let total_sections = front_matter_sections.len() + dict_sections.len();
    if total_sections > KINDLE_HTML_FILE_LIMIT {
        eprintln!(
            "Warning: {} total HTML sections exceeds the Kindle limit of {} files",
            total_sections, KINDLE_HTML_FILE_LIMIT
        );
    }
    let fm_body = front_matter_sections.join("<mbp:pagebreak/>");
    let dict_body = dict_sections.join("<mbp:pagebreak/>");
    // Re-wrap dictionary content in a frameset if the source used one.
    let dict_body = if has_frameset {
        format!("<mbp:frameset>{}</mbp:frameset>", dict_body)
    } else {
        dict_body
    };
    let merged_body = if fm_body.is_empty() {
        dict_body
    } else {
        format!("{}<mbp:pagebreak/>{}", fm_body, dict_body)
    };
    let style_block = first_style.unwrap_or_default();
    let combined = format!(
        "<html><head>{}<guide></guide></head><body>{} <mbp:pagebreak/></body></html>",
        style_block, merged_body
    );
    combined.into_bytes()
}
/// Flatten a dictionary HTML fragment into minimal MOBI7 markup.
///
/// Removes the XML declaration, `xmlns:*` attributes, Amazon `idx:` indexing
/// tags, `class`/`style` attributes, doctype and `<html>`/`<body>` wrappers;
/// the original `<head>` is replaced by one that keeps only its `<style>`
/// blocks plus an empty `<guide>`. Whitespace is then collapsed and a few
/// cosmetic single spaces are reinserted.
///
/// Replacement ORDER is load-bearing: self-closing `<idx:orth .../>` must be
/// handled before the open-tag pattern, `</idx:entry>` becomes the `<hr/>`
/// entry separator before whitespace collapsing, and the `>\s+<` collapse
/// runs last so earlier removals cannot reintroduce inter-tag whitespace.
fn strip_idx_markup(html: &str) -> String {
    use std::sync::OnceLock;
    // All patterns are compiled exactly once; this function runs per entry.
    static XML_DECL: OnceLock<Regex> = OnceLock::new();
    static XMLNS: OnceLock<Regex> = OnceLock::new();
    static STYLE_RE: OnceLock<Regex> = OnceLock::new();
    static HEAD_RE: OnceLock<Regex> = OnceLock::new();
    static IFORM: OnceLock<Regex> = OnceLock::new();
    static INFL_EMPTY: OnceLock<Regex> = OnceLock::new();
    static INFL_FULL: OnceLock<Regex> = OnceLock::new();
    static ORTH_SELF: OnceLock<Regex> = OnceLock::new();
    static ORTH_OPEN: OnceLock<Regex> = OnceLock::new();
    static ORTH_CLOSE: OnceLock<Regex> = OnceLock::new();
    static SHORT_OPEN: OnceLock<Regex> = OnceLock::new();
    static SHORT_CLOSE: OnceLock<Regex> = OnceLock::new();
    static ENTRY_OPEN: OnceLock<Regex> = OnceLock::new();
    static ENTRY_CLOSE: OnceLock<Regex> = OnceLock::new();
    static CLASS_ATTR: OnceLock<Regex> = OnceLock::new();
    static CLASS_ATTR_SQ: OnceLock<Regex> = OnceLock::new();
    static STYLE_ATTR: OnceLock<Regex> = OnceLock::new();
    static STYLE_ATTR_SQ: OnceLock<Regex> = OnceLock::new();
    static WS: OnceLock<Regex> = OnceLock::new();
    static TAG_SPACE: OnceLock<Regex> = OnceLock::new();
    static DOCTYPE_RE: OnceLock<Regex> = OnceLock::new();
    static HTML_OPEN: OnceLock<Regex> = OnceLock::new();
    static HTML_CLOSE: OnceLock<Regex> = OnceLock::new();
    static BODY_OPEN: OnceLock<Regex> = OnceLock::new();
    static BODY_CLOSE: OnceLock<Regex> = OnceLock::new();
    let xml_decl = XML_DECL.get_or_init(|| Regex::new(r"<\?xml[^?]*\?>\s*").unwrap());
    let xmlns = XMLNS.get_or_init(|| Regex::new(r#"\s+xmlns:\w+="[^"]*""#).unwrap());
    let style_re = STYLE_RE.get_or_init(|| Regex::new(r"(?s)<style[^>]*>.*?</style>").unwrap());
    let head_re = HEAD_RE.get_or_init(|| Regex::new(r"(?s)<head>.*?</head>").unwrap());
    let iform = IFORM.get_or_init(|| Regex::new(r"<idx:iform[^/]*/>\s*").unwrap());
    let infl_empty = INFL_EMPTY.get_or_init(|| Regex::new(r"<idx:infl>\s*</idx:infl>\s*").unwrap());
    let infl_full =
        INFL_FULL.get_or_init(|| Regex::new(r"(?s)\s*<idx:infl>.*?</idx:infl>\s*").unwrap());
    let orth_self = ORTH_SELF.get_or_init(|| Regex::new(r"<idx:orth[^>]*/>").unwrap());
    let orth_open = ORTH_OPEN.get_or_init(|| Regex::new(r"<idx:orth[^>]*>").unwrap());
    let orth_close = ORTH_CLOSE.get_or_init(|| Regex::new(r"</idx:orth>").unwrap());
    let short_open = SHORT_OPEN.get_or_init(|| Regex::new(r"<idx:short>\s*").unwrap());
    let short_close = SHORT_CLOSE.get_or_init(|| Regex::new(r"\s*</idx:short>").unwrap());
    let entry_open = ENTRY_OPEN.get_or_init(|| Regex::new(r"<idx:entry[^>]*>\s*").unwrap());
    let entry_close = ENTRY_CLOSE.get_or_init(|| Regex::new(r"\s*</idx:entry>").unwrap());
    let class_attr = CLASS_ATTR.get_or_init(|| Regex::new(r#"\s+class\s*=\s*"[^"]*""#).unwrap());
    let class_attr_sq =
        CLASS_ATTR_SQ.get_or_init(|| Regex::new(r#"\s+class\s*=\s*'[^']*'"#).unwrap());
    let style_attr = STYLE_ATTR.get_or_init(|| Regex::new(r#"\s+style\s*=\s*"[^"]*""#).unwrap());
    let style_attr_sq =
        STYLE_ATTR_SQ.get_or_init(|| Regex::new(r#"\s+style\s*=\s*'[^']*'"#).unwrap());
    let ws = WS.get_or_init(|| Regex::new(r"\s+").unwrap());
    let tag_space = TAG_SPACE.get_or_init(|| Regex::new(r">\s+<").unwrap());
    let doctype_re =
        DOCTYPE_RE.get_or_init(|| Regex::new(r"(?i)<!DOCTYPE[^>]*>\s*").unwrap());
    let html_open = HTML_OPEN.get_or_init(|| Regex::new(r"(?i)<html\b[^>]*>\s*").unwrap());
    let html_close = HTML_CLOSE.get_or_init(|| Regex::new(r"(?i)\s*</html\s*>").unwrap());
    let body_open = BODY_OPEN.get_or_init(|| Regex::new(r"(?i)<body\b[^>]*>\s*").unwrap());
    let body_close = BODY_CLOSE.get_or_init(|| Regex::new(r"(?i)\s*</body\s*>").unwrap());
    // `Cow` + cheap `contains` guards: each regex pass only runs when its
    // literal marker is present, so clean input avoids full scans.
    let mut result: std::borrow::Cow<str> = std::borrow::Cow::Borrowed(html);
    if result.contains("<?xml") {
        result = std::borrow::Cow::Owned(xml_decl.replace_all(&result, "").to_string());
    }
    if result.contains("xmlns:") {
        result = std::borrow::Cow::Owned(xmlns.replace_all(&result, "").to_string());
    }
    if result.contains("<head") {
        // Rebuild the head as just its <style> blocks plus an empty <guide>
        // (the guide is filled in later by insert_guide_reference).
        let style_block: String = head_re
            .find(&result)
            .map(|head_match| {
                style_re
                    .find_iter(head_match.as_str())
                    .map(|m| m.as_str().to_string())
                    .collect::<Vec<_>>()
                    .join("")
            })
            .unwrap_or_default();
        let new_head = if style_block.is_empty() {
            "<head><guide></guide></head>".to_string()
        } else {
            format!("<head>{}<guide></guide></head>", style_block)
        };
        result = std::borrow::Cow::Owned(
            head_re.replace_all(&result, new_head.as_str()).to_string(),
        );
    }
    if result.contains("<idx:iform") {
        result = std::borrow::Cow::Owned(iform.replace_all(&result, "").to_string());
    }
    if result.contains("<idx:infl") {
        // Empty inflection groups first, then whole groups with content.
        result = std::borrow::Cow::Owned(infl_empty.replace_all(&result, "").to_string());
        result = std::borrow::Cow::Owned(infl_full.replace_all(&result, "").to_string());
    }
    if result.contains("<idx:orth") {
        // Self-closing form must go before the open-tag form.
        result = std::borrow::Cow::Owned(orth_self.replace_all(&result, "").to_string());
        result = std::borrow::Cow::Owned(orth_open.replace_all(&result, "").to_string());
    }
    if result.contains("</idx:orth>") {
        result = std::borrow::Cow::Owned(orth_close.replace_all(&result, "").to_string());
    }
    if result.contains("<idx:short>") {
        result = std::borrow::Cow::Owned(short_open.replace_all(&result, "").to_string());
    }
    if result.contains("</idx:short>") {
        result = std::borrow::Cow::Owned(short_close.replace_all(&result, "").to_string());
    }
    if result.contains("<idx:entry") {
        result = std::borrow::Cow::Owned(entry_open.replace_all(&result, "").to_string());
    }
    if result.contains("</idx:entry>") {
        // Entry terminators become <hr/> separators in the flat text.
        result = std::borrow::Cow::Owned(entry_close.replace_all(&result, "<hr/>").to_string());
    }
    // NOTE: these guards match the bare words "class"/"style" anywhere, so
    // they are broad — the regexes themselves only strip real attributes.
    if result.contains("class") {
        result = std::borrow::Cow::Owned(class_attr.replace_all(&result, "").to_string());
        result = std::borrow::Cow::Owned(class_attr_sq.replace_all(&result, "").to_string());
    }
    if result.contains("style") {
        result = std::borrow::Cow::Owned(style_attr.replace_all(&result, "").to_string());
        result = std::borrow::Cow::Owned(style_attr_sq.replace_all(&result, "").to_string());
    }
    if result.contains("<!DOCTYPE") || result.contains("<!doctype") {
        result = std::borrow::Cow::Owned(doctype_re.replace_all(&result, "").to_string());
    }
    if result.contains("<html") || result.contains("<HTML") {
        result = std::borrow::Cow::Owned(html_open.replace_all(&result, "").to_string());
    }
    if result.contains("</html") || result.contains("</HTML") {
        result = std::borrow::Cow::Owned(html_close.replace_all(&result, "").to_string());
    }
    if result.contains("<body") || result.contains("<BODY") {
        result = std::borrow::Cow::Owned(body_open.replace_all(&result, "").to_string());
    }
    if result.contains("</body") || result.contains("</BODY") {
        result = std::borrow::Cow::Owned(body_close.replace_all(&result, "").to_string());
    }
    // Collapse all runs of whitespace, then drop whitespace between tags.
    result = std::borrow::Cow::Owned(ws.replace_all(&result, " ").to_string());
    result = std::borrow::Cow::Owned(tag_space.replace_all(&result, "><").to_string());
    // Reinsert single cosmetic spaces at a few tag junctions the collapse
    // above removed (presumably to match reference kindlegen output —
    // TODO confirm).
    let mut result = result.into_owned();
    if result.contains("</b><") {
        result = result.replace("</b><", "</b> <");
    }
    if result.contains("</p><hr") {
        result = result.replace("</p><hr", "</p> <hr");
    }
    if result.contains("/><b>") {
        result = result.replace("/><b>", "/> <b>");
    }
    result.trim().to_string()
}
fn clean_book_html(html: &str) -> String {
let mut result = html.to_string();
let xmlns = Regex::new(r#"\s+xmlns:\w+="[^"]*""#).unwrap();
result = xmlns.replace_all(&result, "").to_string();
result.trim().to_string()
}
fn rewrite_image_src(
text_bytes: &[u8],
href_to_recindex: &std::collections::HashMap<String, usize>,
spine_items: &[(String, String)],
) -> Vec<u8> {
let text = String::from_utf8_lossy(text_bytes);
let mut path_to_recindex: std::collections::HashMap<String, usize> =
std::collections::HashMap::new();
for (href, &recindex) in href_to_recindex {
path_to_recindex.insert(href.clone(), recindex);
let decoded = percent_decode(href);
if decoded != *href {
path_to_recindex.insert(decoded, recindex);
}
if let Some(fname) = href.rsplit('/').next() {
path_to_recindex.entry(fname.to_string()).or_insert(recindex);
}
}
for (_, spine_href) in spine_items {
if let Some(spine_dir) = spine_href.rsplit_once('/') {
let spine_dir = spine_dir.0; for (href, &recindex) in href_to_recindex {
let relative = format!("../{}", href);
path_to_recindex.entry(relative).or_insert(recindex);
if let Some(img_dir) = href.rsplit_once('/') {
if spine_dir != img_dir.0 {
let relative2 = format!("../{}", href);
path_to_recindex.entry(relative2).or_insert(recindex);
} else {
let fname = img_dir.1;
path_to_recindex.entry(fname.to_string()).or_insert(recindex);
}
}
}
}
}
let src_re = Regex::new(r#"(?i)\bsrc\s*=\s*"([^"]*)""#).unwrap();
let result = src_re.replace_all(&text, |caps: ®ex::Captures| {
let src_path = caps.get(1).unwrap().as_str();
if let Some(&recindex) = path_to_recindex.get(src_path) {
format!("recindex=\"{:05}\"", recindex)
} else {
let decoded = percent_decode(src_path);
if let Some(&recindex) = path_to_recindex.get(&decoded) {
format!("recindex=\"{:05}\"", recindex)
} else {
if let Some(fname) = src_path.rsplit('/').next() {
if let Some(&recindex) = path_to_recindex.get(fname) {
format!("recindex=\"{:05}\"", recindex)
} else {
caps.get(0).unwrap().as_str().to_string()
}
} else {
caps.get(0).unwrap().as_str().to_string()
}
}
}
});
result.into_owned().into_bytes()
}
/// Decode `%XX` percent-escapes in `s`.
///
/// Escapes are decoded at the byte level and the result is reinterpreted as
/// UTF-8 (falling back to lossy replacement for invalid sequences), so
/// multi-byte escapes such as `%C3%A9` → `é` decode correctly.
///
/// Fixes two defects in the previous version:
/// * a `%` followed by anything other than two hex digits silently DROPPED
///   the two following characters (`"%zz"` became `"%"`); malformed escapes
///   are now passed through verbatim;
/// * decoded bytes above 0x7F were pushed with `byte as char` (Latin-1),
///   corrupting UTF-8-encoded escapes.
fn percent_decode(s: &str) -> String {
    let bytes = s.as_bytes();
    let mut out: Vec<u8> = Vec::with_capacity(bytes.len());
    let mut i = 0usize;
    while i < bytes.len() {
        // A valid escape needs '%' plus two in-bounds hex digits.
        if bytes[i] == b'%' && i + 2 < bytes.len() {
            let hi = (bytes[i + 1] as char).to_digit(16);
            let lo = (bytes[i + 2] as char).to_digit(16);
            if let (Some(hi), Some(lo)) = (hi, lo) {
                out.push((hi * 16 + lo) as u8);
                i += 3;
                continue;
            }
        }
        // Not an escape (or malformed): copy the byte through unchanged.
        out.push(bytes[i]);
        i += 1;
    }
    String::from_utf8(out)
        .unwrap_or_else(|e| String::from_utf8_lossy(e.as_bytes()).into_owned())
}
/// Replace the empty `<guide></guide>` stub with a guide holding an index
/// reference whose `filepos` points at the first `<b>` tag, adjusted for the
/// bytes the expanded guide itself inserts. Returns the text unchanged when
/// either marker is absent.
fn insert_guide_reference(text_bytes: &[u8]) -> Vec<u8> {
    const EMPTY_GUIDE: &[u8] = b"<guide></guide>";
    let (guide_pos, first_b) = match (
        find_bytes(text_bytes, EMPTY_GUIDE),
        find_bytes(text_bytes, b"<b>"),
    ) {
        (Some(g), Some(b)) => (g, b),
        _ => return text_bytes.to_vec(),
    };
    // The zero-filled template has the same width as the final guide because
    // filepos is always rendered as exactly 10 digits.
    let template = b"<guide><reference title=\"IndexName\" type=\"index\" filepos=0000000000 /></guide>";
    let growth = template.len() - EMPTY_GUIDE.len();
    // The <b> target shifts forward by `growth` once the guide expands.
    let guide = format!(
        "<guide><reference title=\"IndexName\" type=\"index\" filepos={:010} /></guide>",
        first_b + growth
    );
    let mut out = Vec::with_capacity(text_bytes.len() + growth);
    out.extend_from_slice(&text_bytes[..guide_pos]);
    out.extend_from_slice(guide.as_bytes());
    out.extend_from_slice(&text_bytes[guide_pos + EMPTY_GUIDE.len()..]);
    out
}
const PARALLEL_THRESHOLD: usize = 1024 * 1024;
fn compute_chunk_size(total_length: usize) -> usize {
let mut chunk_size = RECORD_SIZE;
if total_length / chunk_size > 65000 {
chunk_size = (total_length / 65000) + 1;
chunk_size = chunk_size.next_power_of_two();
}
chunk_size
}
/// Number of bytes at the end of `chunk` forming an incomplete UTF-8
/// sequence (a lead byte with too few continuation bytes). Returns 0 when
/// the chunk ends on a complete character, or when the tail is not valid
/// UTF-8 structure at all.
fn incomplete_utf8_tail_bytes(chunk: &[u8]) -> usize {
    // Walk backwards past at most 3 continuation bytes looking for a lead.
    for (cont_count, &b) in chunk.iter().rev().take(3).enumerate() {
        if (0x80..=0xBF).contains(&b) {
            continue; // continuation byte — keep scanning backwards
        }
        // How many bytes should the sequence led by `b` contain?
        let seq_len = match b {
            0x00..=0x7F => 1,
            0xC0..=0xDF => 2,
            0xE0..=0xEF => 3,
            0xF0..=0xF7 => 4,
            _ => return 0, // not a valid UTF-8 lead byte
        };
        let present = cont_count + 1;
        return if present < seq_len { present } else { 0 };
    }
    0
}
/// Pad `text` with spaces so that every `chunk_size`-byte record ends either
/// at a `><` tag junction or, failing that, on a UTF-8 character boundary.
/// The final partial chunk is left unpadded.
fn pad_text_for_chunking(text: &[u8], chunk_size: usize) -> Vec<u8> {
    let mut out: Vec<u8> = Vec::with_capacity(text.len() + 8 * (text.len() / chunk_size + 1));
    let mut pos = 0usize;
    while pos < text.len() {
        // Space left in the chunk currently being filled.
        let room = chunk_size - (out.len() % chunk_size);
        if text.len() - pos <= room {
            // Remainder fits entirely; no padding for the last chunk.
            out.extend_from_slice(&text[pos..]);
            break;
        }
        let window = &text[pos..pos + room];
        // Prefer cutting at the LAST ">" "<" junction inside the window;
        // the cut index points at the '<'.
        let cut = window
            .windows(2)
            .rposition(|pair| pair == b"><")
            .map(|i| i + 1)
            .unwrap_or_else(|| {
                // No tag junction: back off over any incomplete UTF-8 tail
                // (degenerate case: take a single byte to guarantee progress).
                let tail = incomplete_utf8_tail_bytes(window);
                if tail >= room { 1 } else { room - tail }
            });
        out.extend_from_slice(&window[..cut]);
        pos += cut;
        // Space-fill the remainder of the chunk.
        out.resize(out.len() + (room - cut), b' ');
    }
    out
}
/// Partition `0..text_bytes.len()` into consecutive `(start, end)` ranges of
/// at most `chunk_size` bytes. Despite the name, no UTF-8 boundary
/// adjustment happens here — callers pre-pad the text (see
/// `pad_text_for_chunking`) so fixed-size cuts are already safe.
fn split_on_utf8_boundaries(text_bytes: &[u8], chunk_size: usize) -> Vec<(usize, usize)> {
    let total = text_bytes.len();
    (0..total)
        .step_by(chunk_size)
        .map(|start| (start, (start + chunk_size).min(total)))
        .collect()
}
/// Split `text_bytes` into records and PalmDoc-compress each, appending the
/// 2-byte record trailer (0x00 multibyte-overhang byte + 0x81 TBS byte).
/// Inputs above `PARALLEL_THRESHOLD` are compressed on worker threads.
/// Returns the compressed records plus the uncompressed text length.
fn compress_text(text_bytes: &[u8]) -> (Vec<Vec<u8>>, usize) {
    // Compress one chunk and append the record trailer.
    fn compress_record(data: &[u8]) -> Vec<u8> {
        let mut rec = palmdoc::compress(data);
        rec.push(0x00);
        rec.push(0x81);
        rec
    }
    let total_length = text_bytes.len();
    let chunk_size = compute_chunk_size(total_length);
    let chunks: Vec<Vec<u8>> = split_on_utf8_boundaries(text_bytes, chunk_size)
        .into_iter()
        .map(|(s, e)| text_bytes[s..e].to_vec())
        .collect();
    // Small inputs (or a single chunk): compress serially.
    if total_length <= PARALLEL_THRESHOLD || chunks.len() < 2 {
        let records = chunks.iter().map(|c| compress_record(c)).collect();
        return (records, total_length);
    }
    let worker_count = std::thread::available_parallelism()
        .map(|n| n.get())
        .unwrap_or(1)
        .min(chunks.len());
    eprintln!(
        " Using {} workers for parallel compression ({} chunks)...",
        worker_count,
        chunks.len()
    );
    let chunk_total = chunks.len();
    let shared = std::sync::Arc::new(chunks);
    let mut workers = Vec::with_capacity(worker_count);
    for wid in 0..worker_count {
        let shared = std::sync::Arc::clone(&shared);
        // Each worker handles every worker_count-th chunk (strided split),
        // tagging results with the chunk index so order can be restored.
        workers.push(std::thread::spawn(move || {
            let mut done: Vec<(usize, Vec<u8>)> = Vec::new();
            let mut i = wid;
            while i < chunk_total {
                done.push((i, compress_record(&shared[i])));
                i += worker_count;
            }
            done
        }));
    }
    let mut tagged: Vec<(usize, Vec<u8>)> = Vec::with_capacity(chunk_total);
    for worker in workers {
        tagged.extend(worker.join().unwrap());
    }
    tagged.sort_by_key(|&(i, _)| i);
    let records = tagged.into_iter().map(|(_, rec)| rec).collect();
    (records, total_length)
}
/// Split `text_bytes` into uncompressed records, each with the 2-byte record
/// trailer (0x00 multibyte-overhang byte + 0x81 TBS byte) appended.
/// Returns the records plus the total text length.
fn split_text_uncompressed(text_bytes: &[u8]) -> (Vec<Vec<u8>>, usize) {
    let total_length = text_bytes.len();
    let chunk_size = compute_chunk_size(total_length);
    let mut records: Vec<Vec<u8>> = Vec::new();
    for (start, end) in split_on_utf8_boundaries(text_bytes, chunk_size) {
        let mut rec = Vec::with_capacity(end - start + 2);
        rec.extend_from_slice(&text_bytes[start..end]);
        rec.push(0x00);
        rec.push(0x81);
        records.push(rec);
    }
    (records, total_length)
}
#[cfg(test)]
mod record_split_tests {
    use super::*;
    /// Strip the 2-byte record trailer that the builders append, asserting
    /// its expected shape (last byte 0x81, multibyte-overhang bits clear).
    fn strip_trailers(rec: &[u8]) -> &[u8] {
        assert!(rec.len() >= 2, "record too small to contain trailers");
        let tbs_byte = rec[rec.len() - 1];
        assert_eq!(tbs_byte, 0x81, "TBS byte must be 0x81 at end of record");
        let mb_byte = rec[rec.len() - 2];
        assert_eq!(mb_byte & 0x3, 0, "multibyte byte overhang must be 0");
        &rec[..rec.len() - 2]
    }
    /// Reference PalmDoc decompressor used to round-trip-check the
    /// compressor. Token classes: 0x00 literal NUL; 0x01-0x08 literal run of
    /// n bytes; 0x09-0x7F literal byte; 0x80-0xBF two-byte length/distance
    /// back-reference; 0xC0-0xFF space followed by (byte ^ 0x80).
    fn palmdoc_decompress(src: &[u8]) -> Vec<u8> {
        let mut out: Vec<u8> = Vec::with_capacity(src.len() * 2);
        let mut i = 0usize;
        while i < src.len() {
            let b = src[i];
            i += 1;
            if b == 0 {
                out.push(0);
            } else if (1..=8).contains(&b) {
                // Literal run: copy the next n bytes verbatim.
                let n = b as usize;
                let end = (i + n).min(src.len());
                out.extend_from_slice(&src[i..end]);
                i = end;
            } else if (9..=0x7F).contains(&b) {
                out.push(b);
            } else if (0x80..=0xBF).contains(&b) {
                // Back-reference: 11-bit distance, 3-bit length (+3).
                if i >= src.len() {
                    break;
                }
                let b2 = src[i];
                i += 1;
                let word = ((b as u16) << 8) | (b2 as u16);
                let distance = ((word >> 3) & 0x7FF) as usize;
                let length = ((word & 0x7) as usize) + 3;
                if distance == 0 || distance > out.len() {
                    break; // malformed reference: stop decoding
                }
                let start = out.len() - distance;
                // Byte-by-byte copy so overlapping references work.
                for k in 0..length {
                    let byte = out[start + k];
                    out.push(byte);
                }
            } else {
                // 0xC0-0xFF: space + character with the high bit cleared.
                out.push(0x20);
                out.push(b ^ 0x80);
            }
        }
        out
    }
    #[test]
    fn incomplete_utf8_tail_bytes_on_clean_ascii() {
        assert_eq!(incomplete_utf8_tail_bytes(b"hello"), 0);
    }
    #[test]
    fn incomplete_utf8_tail_bytes_on_complete_two_byte() {
        assert_eq!(incomplete_utf8_tail_bytes("αβ".as_bytes()), 0);
    }
    #[test]
    fn incomplete_utf8_tail_bytes_on_lead_only_two_byte() {
        // 0xCE is the lead of a 2-byte sequence with no continuation.
        let bytes = [0xCEu8];
        assert_eq!(incomplete_utf8_tail_bytes(&bytes), 1);
    }
    #[test]
    fn incomplete_utf8_tail_bytes_on_lead_only_three_byte() {
        assert_eq!(incomplete_utf8_tail_bytes(&[0xE2]), 1);
        assert_eq!(incomplete_utf8_tail_bytes(&[0xE2, 0x80]), 2);
        assert_eq!(incomplete_utf8_tail_bytes(&[0xE2, 0x80, 0xA0]), 0);
    }
    #[test]
    fn incomplete_utf8_tail_bytes_on_lead_only_four_byte() {
        assert_eq!(incomplete_utf8_tail_bytes(&[0xF0]), 1);
        assert_eq!(incomplete_utf8_tail_bytes(&[0xF0, 0x9F]), 2);
        assert_eq!(incomplete_utf8_tail_bytes(&[0xF0, 0x9F, 0x98]), 3);
        assert_eq!(incomplete_utf8_tail_bytes(&[0xF0, 0x9F, 0x98, 0x80]), 0);
    }
    #[test]
    fn split_on_utf8_boundaries_fixed_chunks() {
        // A 2-byte char straddling the 4096 boundary is still cut at 4096 —
        // the split is fixed-size by design (padding happens upstream).
        let mut text = vec![b'x'; 4095];
        text.extend_from_slice("α".as_bytes()); text.extend_from_slice(&[b'y'; 100]);
        let ranges = split_on_utf8_boundaries(&text, 4096);
        assert_eq!(ranges[0], (0, 4096));
    }
    #[test]
    fn split_on_utf8_boundaries_reassembles_to_original() {
        // Mixed ASCII / 2-byte / 3-byte content must reassemble losslessly.
        let mut text: Vec<u8> = Vec::new();
        for i in 0..5000 {
            if i % 7 == 0 {
                text.extend_from_slice("αβγ".as_bytes());
            } else if i % 11 == 0 {
                text.extend_from_slice("\u{2020}".as_bytes());
            } else {
                text.push(b'a' + (i as u8 % 26));
            }
        }
        let ranges = split_on_utf8_boundaries(&text, 4096);
        let mut reassembled = Vec::with_capacity(text.len());
        for (s, e) in &ranges {
            reassembled.extend_from_slice(&text[*s..*e]);
            assert!(e - s <= 4096);
        }
        assert_eq!(reassembled, text);
    }
    /// Count `<tag>` opens minus `</tag>` closes in `bytes` (0 = balanced).
    fn count_tag_balance(bytes: &[u8], tag: &str) -> i32 {
        let haystack = std::str::from_utf8(bytes).unwrap_or("");
        let open = format!("<{}>", tag);
        let close = format!("</{}>", tag);
        let opens = haystack.matches(&open).count() as i32;
        let closes = haystack.matches(&close).count() as i32;
        opens - closes
    }
    #[test]
    fn strip_trailers_removes_tbs_and_multibyte() {
        let mut rec = b"hello world".to_vec();
        rec.push(0x00); rec.push(0x81); let stripped = strip_trailers(&rec);
        assert_eq!(stripped, b"hello world");
    }
    #[test]
    fn palmdoc_compress_decompress_round_trip() {
        // Repeated text exercises the back-reference token path.
        let original = b"The quick brown fox jumps over the lazy dog. \
The quick brown fox jumps over the lazy dog.";
        let compressed = super::palmdoc::compress(original);
        let decompressed = palmdoc_decompress(&compressed);
        assert_eq!(decompressed, original);
    }
    #[test]
    fn count_tag_balance_balanced_and_unbalanced() {
        let balanced = b"<p>hello</p><p>world</p>";
        assert_eq!(count_tag_balance(balanced, "p"), 0);
        let unbalanced = b"<p>hello<p>world</p>";
        assert_eq!(count_tag_balance(unbalanced, "p"), 1);
    }
}
/// Search forward from `start` for `needle` (expected to begin with `<b>`)
/// occurring at a dictionary-entry boundary. Returns
/// `(match_pos, match_pos + 3)` — the second value points just past the
/// 3-byte opening `<b>` tag — or `None` when no boundary-aligned match
/// exists.
fn scan_for_bold_at_boundary(
    text_bytes: &[u8],
    needle: &[u8],
    start: usize,
) -> Option<(usize, usize)> {
    let mut cursor = start;
    while let Some(hit) = find_bytes_from(text_bytes, needle, cursor) {
        if is_entry_boundary(text_bytes, hit) {
            return Some((hit, hit + 3));
        }
        // Mid-definition bold text; keep scanning past this occurrence.
        cursor = hit + needle.len();
    }
    None
}
fn find_entry_positions(text_bytes: &[u8], entries: &[DictionaryEntry]) -> Vec<(usize, usize)> {
let mut positions = Vec::with_capacity(entries.len());
let mut search_start: usize = 0;
for entry in entries {
let escaped_hw = entry.headword
.replace('&', "&")
.replace('\'', "'")
.replace('"', """)
.replace('<', "<")
.replace('>', ">");
let headword_bytes = escaped_hw.as_bytes();
let raw_entities_hw = if entry.headword.contains('&') || entry.headword.contains('\'') {
Some(entry.headword
.replace('"', """)
.replace('<', "<")
.replace('>', ">"))
} else {
None
};
let mut bold_needle = Vec::with_capacity(3 + headword_bytes.len() + 4);
bold_needle.extend_from_slice(b"<b>");
bold_needle.extend_from_slice(headword_bytes);
bold_needle.extend_from_slice(b"</b>");
let mut found = scan_for_bold_at_boundary(text_bytes, &bold_needle, search_start);
if found.is_none() {
if let Some(raw_hw) = &raw_entities_hw {
let raw_bytes = raw_hw.as_bytes();
let mut raw_needle = Vec::with_capacity(3 + raw_bytes.len() + 4);
raw_needle.extend_from_slice(b"<b>");
raw_needle.extend_from_slice(raw_bytes);
raw_needle.extend_from_slice(b"</b>");
found = scan_for_bold_at_boundary(text_bytes, &raw_needle, search_start);
}
}
let (block_start, pos) = match found {
Some(result) => result,
None => {
let mut bare_found: Option<(usize, usize)> = None;
let mut scan_from = search_start;
loop {
match find_bytes_from(text_bytes, headword_bytes, scan_from) {
Some(p) => {
let search_from = if p >= 10 { p - 10 } else { 0 };
let bs = match rfind_bytes(&text_bytes[search_from..p], b"<b>") {
Some(rel) => search_from + rel,
None => p,
};
if is_entry_boundary(text_bytes, bs) {
bare_found = Some((bs, p));
break;
}
scan_from = p + headword_bytes.len();
}
None => break,
}
}
match bare_found {
Some(result) => result,
None => {
positions.push((0, 0));
continue;
}
}
}
};
let hr_pos = find_bytes_from(text_bytes, b"<hr/>", pos);
let text_len = match hr_pos {
Some(hr) => hr - block_start,
None => {
let block_end =
find_bytes_from(text_bytes, b"<mbp:pagebreak/>", pos).unwrap_or(text_bytes.len());
block_end - block_start
}
};
positions.push((block_start, text_len));
search_start = pos + headword_bytes.len();
}
let unfound: Vec<_> = entries.iter().zip(positions.iter())
.filter(|(_, (s, l))| *s == 0 && *l == 0)
.map(|(e, _)| e.headword.clone())
.collect();
if !unfound.is_empty() {
eprintln!("Warning: {} / {} entries not found in text blob", unfound.len(), entries.len());
for hw in unfound.iter().take(20) {
eprintln!(" Not found: {:?}", hw);
}
if unfound.len() > 20 {
eprintln!(" ... and {} more", unfound.len() - 20);
}
}
positions
}
/// Heuristic: does the `<b>` at `bold_pos` sit at the start of a dictionary
/// entry (rather than being mid-definition bold text)?
///
/// Anything within the first 200 bytes is accepted unconditionally.
/// Otherwise the bytes immediately before the tag (ignoring spaces) must end
/// with an entry-separating construct: `<h5>`, `<hr/>`, or — via the wider
/// 24-byte window for generic self-closing tags — `<mbp:pagebreak/>`.
fn is_entry_boundary(text_bytes: &[u8], bold_pos: usize) -> bool {
    if bold_pos < 200 {
        return true;
    }
    // Skip the run of spaces directly before the tag.
    let mut cut = bold_pos;
    while cut > 0 && text_bytes[cut - 1] == b' ' {
        cut -= 1;
    }
    let near = &text_bytes[cut.saturating_sub(8)..cut];
    if near.ends_with(b"<h5>") || near.ends_with(b"<hr/> ") || near.ends_with(b"<hr/>") {
        return true;
    }
    // Generic self-closing tag: widen the window for separators longer than
    // the 8-byte near window (e.g. <mbp:pagebreak/>).
    if near.ends_with(b"/> ") || near.ends_with(b"/>") {
        let far = &text_bytes[cut.saturating_sub(24)..cut];
        return far.ends_with(b"<hr/>")
            || far.ends_with(b"<mbp:pagebreak/>")
            || far.ends_with(b"<h5>");
    }
    false
}
/// Build the sorted list of INDX lookup terms from the located entries.
///
/// Every headword becomes a term pointing at its entry's text block; unless
/// `headwords_only` is set, inflected forms are added as extra terms that
/// point at their base entry. Terms are returned sorted by their encoded
/// INDX label bytes.
fn build_lookup_terms(
    entries: &[DictionaryEntry],
    positions: &[(usize, usize)],
    text_bytes: &[u8],
    headwords_only: bool,
) -> Vec<LookupTerm> {
    use std::collections::HashMap;
    // label -> (start_pos, text_len, headword_display_len, entry ordinal).
    let mut terms: HashMap<String, (usize, usize, usize, usize)> = HashMap::new();
    let mut headwords: HashSet<String> = HashSet::new();
    // Pass 1: one term per headword.
    for (entry_ordinal, (entry, &(start_pos, text_len))) in
        entries.iter().zip(positions.iter()).enumerate()
    {
        let hw = &entry.headword;
        let hw_bytes = hw.as_bytes();
        // Display length of "<b>{hw}</b> " (3 + hw + 4 + trailing space).
        let mut hw_display_len = 3 + hw_bytes.len() + 4 + 1;
        if start_pos > 0 && start_pos + hw_display_len <= text_bytes.len() {
            // Verify the trailing space actually exists in the text; if the
            // bytes differ, fall back to the length without it.
            let mut expected = Vec::new();
            expected.extend_from_slice(b"<b>");
            expected.extend_from_slice(hw_bytes);
            expected.extend_from_slice(b"</b> ");
            let actual = &text_bytes[start_pos..start_pos + hw_display_len];
            if actual != expected.as_slice() {
                hw_display_len = 3 + hw_bytes.len() + 4;
            }
        }
        terms.insert(
            hw.clone(),
            (start_pos, text_len, hw_display_len, entry_ordinal),
        );
        headwords.insert(hw.clone());
    }
    // Pass 2: inflected forms (skipped for --headwords-only). An inflection
    // is only added if no term with that label exists yet — including terms
    // added by EARLIER iterations of this same loop, so insertion order
    // matters here.
    if !headwords_only {
        // Forms containing bracket characters are skipped entirely.
        let bad_chars: HashSet<char> = "()[]{}".chars().collect();
        for (entry_ordinal, (entry, &(start_pos, text_len))) in
            entries.iter().zip(positions.iter()).enumerate()
        {
            for iform in &entry.inflections {
                if !terms.contains_key(iform)
                    && !iform.chars().any(|c| bad_chars.contains(&c))
                {
                    // Reuse the base headword's display length when known.
                    let hw = &entry.headword;
                    let hw_display_len = if let Some((_, _, hdl, _)) = terms.get(hw) {
                        *hdl
                    } else {
                        3 + iform.as_bytes().len() + 4 + 1
                    };
                    terms.insert(
                        iform.clone(),
                        (start_pos, text_len, hw_display_len, entry_ordinal),
                    );
                }
            }
        }
    }
    eprintln!("Encoding {} unique lookup terms...", terms.len());
    // Encode every label once, then sort by the ENCODED byte sequence (the
    // INDX sort order), not by the raw string.
    let mut label_bytes_map: HashMap<String, Vec<u8>> = HashMap::new();
    for label in terms.keys() {
        label_bytes_map.insert(label.clone(), indx::encode_indx_label(label));
    }
    let mut sorted_labels: Vec<String> = terms.keys().cloned().collect();
    sorted_labels.sort_by(|a, b| label_bytes_map[a].cmp(&label_bytes_map[b]));
    sorted_labels
        .into_iter()
        .map(|label| {
            let (start_pos, text_len, hw_display_len, source_ordinal) = terms[&label];
            LookupTerm {
                label: label.clone(),
                label_bytes: label_bytes_map[&label].clone(),
                start_pos,
                text_len,
                headword_display_len: hw_display_len,
                source_ordinal,
            }
        })
        .collect()
}
#[allow(clippy::too_many_arguments)]
/// Build PalmDB record 0 for MOBI7 output: the 16-byte PalmDOC header,
/// the MOBI header, the EXTH metadata block, and the full book name,
/// padded to a 4-byte boundary and to a minimum overall size.
///
/// Index/record arguments are PalmDB record numbers computed by the caller;
/// 0xFFFFFFFF is the MOBI convention for "not present". Exact byte offsets
/// below follow the MOBI header layout — do not reorder the puts.
fn build_record0(
    opf: &OPFData,
    text_length: usize,
    text_record_count: usize,
    first_non_book_record: usize,
    orth_index_record: usize,
    infl_index_record: usize,
    _total_records: usize,
    flis_record: usize,
    fcis_record: usize,
    no_compress: bool,
    headword_chars: &HashSet<u32>,
    is_dictionary: bool,
    first_image_record: usize,
    cover_offset: Option<u32>,
    thumb_offset: Option<u32>,
    kf8_cover_uri: Option<&str>,
    fixed_layout: Option<&exth::FixedLayoutMeta>,
    override_version: Option<u32>,
    kf8_boundary_record: Option<u32>,
    srcs_record: Option<usize>,
    hd_geometry: Option<&str>,
    creator_tag: bool,
    doc_type: Option<&str>,
) -> Vec<u8> {
    let default_name = if is_dictionary { "Dictionary" } else { "Book" };
    let full_name = if opf.title.is_empty() {
        default_name
    } else {
        &opf.title
    };
    let full_name_bytes = full_name.as_bytes();
    // 1 = no compression, 2 = PalmDoc compression.
    let compression_type: u16 = if no_compress { 1 } else { 2 };
    let mut record_size = RECORD_SIZE;
    let mut text_rec_count = text_record_count;
    if text_rec_count > 65000 {
        // Record count is a u16 field: grow the declared record size and
        // clamp the count (mirrors compute_chunk_size's growth rule).
        record_size = std::cmp::max(RECORD_SIZE, (text_length / 65000) + 1);
        text_rec_count = std::cmp::min(text_rec_count, 65535);
    }
    // --- 16-byte PalmDOC header ---
    let mut palmdoc = Vec::with_capacity(16);
    palmdoc.extend_from_slice(&compression_type.to_be_bytes());
    palmdoc.extend_from_slice(&0u16.to_be_bytes());
    palmdoc.extend_from_slice(&(text_length as u32).to_be_bytes());
    palmdoc.extend_from_slice(&(text_rec_count as u16).to_be_bytes());
    palmdoc.extend_from_slice(&(record_size as u16).to_be_bytes());
    palmdoc.extend_from_slice(&0u16.to_be_bytes());
    palmdoc.extend_from_slice(&0u16.to_be_bytes());
    assert_eq!(palmdoc.len(), 16);
    // --- MOBI header (offsets are relative to the "MOBI" magic) ---
    let mut mobi = vec![0u8; MOBI_HEADER_LENGTH];
    put_bytes(&mut mobi, 0, b"MOBI");
    put32(&mut mobi, 4, MOBI_HEADER_LENGTH as u32);
    put32(&mut mobi, 8, 2); // mobi type: 2 = book
    put32(&mut mobi, 12, 65001); // text encoding: UTF-8 codepage
    // Unique id derived from the title hash (deterministic per title).
    let uid_hash = md5_simple(full_name.as_bytes());
    let unique_id = u32::from_be_bytes([uid_hash[0], uid_hash[1], uid_hash[2], uid_hash[3]]);
    put32(&mut mobi, 16, unique_id);
    let version = override_version.unwrap_or(7);
    // off 20: file version; off 24/28: orthographic/inflection INDX records.
    put32(&mut mobi, 20, version); put32(&mut mobi, 24, orth_index_record as u32);
    // offs 32..64: unused index record slots, all "not present".
    put32(&mut mobi, 28, infl_index_record as u32); put32(&mut mobi, 32, 0xFFFFFFFF); put32(&mut mobi, 36, 0xFFFFFFFF); for off in (40..64).step_by(4) {
    put32(&mut mobi, off, 0xFFFFFFFF); }
    put32(&mut mobi, 64, first_non_book_record as u32);
    // offs 76/80/84: book locale, dictionary input and output languages.
    put32(&mut mobi, 76, locale_code(&opf.language));
    put32(&mut mobi, 80, locale_code(&opf.dict_in_language));
    put32(&mut mobi, 84, locale_code(&opf.dict_out_language));
    put32(&mut mobi, 88, version); put32(&mut mobi, 92, first_image_record as u32); put32(&mut mobi, 96, 0); put32(&mut mobi, 100, 0);
    // off 112: EXTH flag word — 0x50 for dictionaries, 0x850 for books
    // (bit meanings not expanded here; values mirror reference output).
    if is_dictionary {
        put32(&mut mobi, 112, 0x50);
    } else {
        put32(&mut mobi, 112, 0x850);
    }
    put32(&mut mobi, 148, 0xFFFFFFFF); put32(&mut mobi, 152, 0xFFFFFFFF);
    // off 176: packed word — high half 1, low half (flis_record - 1).
    put32(
        &mut mobi,
        176,
        (1u32 << 16) | ((flis_record - 1) as u32),
    );
    put32(&mut mobi, 180, 1);
    // offs 184-196: FCIS/FLIS record numbers and counts.
    put32(&mut mobi, 184, fcis_record as u32);
    put32(&mut mobi, 188, 1);
    put32(&mut mobi, 192, flis_record as u32);
    put32(&mut mobi, 196, 1);
    put32(&mut mobi, 224, 3);
    put32(&mut mobi, 216, 0xFFFFFFFF);
    put32(&mut mobi, 220, 0xFFFFFFFF);
    put32(&mut mobi, 228, 0xFFFFFFFF);
    put32(&mut mobi, 232, 0xFFFFFFFF);
    put32(&mut mobi, 236, 0xFFFFFFFF);
    put32(&mut mobi, 240, 0xFFFFFFFF);
    // offs 208/212 and 244/248: SRCS record and count when present.
    if let Some(srcs_idx) = srcs_record {
        put32(&mut mobi, 208, srcs_idx as u32);
        put32(&mut mobi, 212, 1);
        put32(&mut mobi, 244, srcs_idx as u32);
        put32(&mut mobi, 248, 1);
    } else {
        put32(&mut mobi, 244, 0xFFFFFFFF);
        put32(&mut mobi, 248, 0xFFFFFFFF);
    }
    put32(&mut mobi, 256, 0xFFFFFFFF);
    // --- EXTH metadata block (dictionary vs book variants) ---
    let exth_data = if is_dictionary {
        exth::build_exth(
            full_name,
            &opf.author,
            &opf.date,
            &opf.language,
            &opf.dict_in_language,
            &opf.dict_out_language,
            headword_chars,
            creator_tag,
            cover_offset,
        )
    } else {
        exth::build_book_exth(
            full_name,
            &opf.author,
            &opf.date,
            &opf.language,
            cover_offset,
            thumb_offset,
            kf8_cover_uri,
            fixed_layout,
            kf8_boundary_record,
            hd_geometry,
            creator_tag,
            doc_type,
            None, None, None, None, )
    };
    // Full name sits after PalmDOC (16) + MOBI header + EXTH.
    let full_name_offset = 16 + MOBI_HEADER_LENGTH + exth_data.len();
    put32(&mut mobi, 68, full_name_offset as u32);
    put32(&mut mobi, 72, full_name_bytes.len() as u32);
    let mut record0 = Vec::new();
    record0.extend_from_slice(&palmdoc);
    record0.extend_from_slice(&mobi);
    record0.extend_from_slice(&exth_data);
    record0.extend_from_slice(full_name_bytes);
    // Pad to a 4-byte boundary, then to the minimum record-0 size.
    while record0.len() % 4 != 0 {
        record0.push(0x00);
    }
    const MIN_RECORD0_SIZE: usize = 8892;
    if record0.len() < MIN_RECORD0_SIZE {
        record0.resize(MIN_RECORD0_SIZE, 0x00);
    }
    record0
}
/// Build record 0 for a KF8 (.azw3) book: the 16-byte PalmDOC header,
/// the 264-byte MOBI header, the EXTH metadata block, and the full book
/// title, padded out to a fixed minimum record size.
///
/// All `*_record` parameters are PDB record indices assigned by the caller;
/// fields written as 0xFFFFFFFF mean "not present". Offsets passed to
/// `put32` below are relative to the start of the MOBI header, which itself
/// begins at byte 16 of the record (after the PalmDOC header).
fn build_kf8_record0(
    opf: &OPFData,
    text_length: usize,
    text_record_count: usize,
    first_non_book_record: usize,
    fdst_record: usize,
    fdst_flow_count: usize,
    skeleton_indx_record: usize,
    fragment_indx_record: usize,
    ncx_record: usize,
    datp_record: usize,
    flis_record: usize,
    fcis_record: usize,
    no_compress: bool,
    cover_offset: Option<u32>,
    thumb_offset: Option<u32>,
    kf8_cover_uri: Option<&str>,
    fixed_layout: Option<&exth::FixedLayoutMeta>,
    first_image_record: usize,
    creator_tag: bool,
    srcs_record: Option<usize>,
    hd_geometry: Option<&str>,
    _total_records: usize,
    doc_type: Option<&str>,
) -> Vec<u8> {
    // Fall back to a placeholder so the header never carries an empty title.
    let full_name = if opf.title.is_empty() {
        "Book"
    } else {
        &opf.title
    };
    let full_name_bytes = full_name.as_bytes();
    // PalmDOC compression field: 1 = none, 2 = PalmDOC compression.
    let compression_type: u16 = if no_compress { 1 } else { 2 };
    let mut record_size = RECORD_SIZE;
    let mut text_rec_count = text_record_count;
    // The record count is stored as a u16; for very large texts grow the
    // record size instead so the count stays representable.
    if text_rec_count > 65000 {
        record_size = std::cmp::max(RECORD_SIZE, (text_length / 65000) + 1);
        text_rec_count = std::cmp::min(text_rec_count, 65535);
    }
    // --- 16-byte PalmDOC header ---
    let mut palmdoc = Vec::with_capacity(16);
    palmdoc.extend_from_slice(&compression_type.to_be_bytes());
    palmdoc.extend_from_slice(&0u16.to_be_bytes()); // unused
    palmdoc.extend_from_slice(&(text_length as u32).to_be_bytes()); // uncompressed text length
    palmdoc.extend_from_slice(&(text_rec_count as u16).to_be_bytes());
    palmdoc.extend_from_slice(&(record_size as u16).to_be_bytes());
    palmdoc.extend_from_slice(&0u16.to_be_bytes()); // encryption: none
    palmdoc.extend_from_slice(&0u16.to_be_bytes()); // unused
    assert_eq!(palmdoc.len(), 16);
    // --- MOBI header ---
    let mut mobi = vec![0u8; MOBI_HEADER_LENGTH];
    put_bytes(&mut mobi, 0, b"MOBI");
    put32(&mut mobi, 4, MOBI_HEADER_LENGTH as u32);
    put32(&mut mobi, 8, 2); put32(&mut mobi, 12, 65001); // type 2 = book; text encoding 65001 = UTF-8
    // Unique ID derived from the title's MD5 so rebuilds of the same book match.
    let uid_hash = md5_simple(full_name.as_bytes());
    let unique_id = u32::from_be_bytes([uid_hash[0], uid_hash[1], uid_hash[2], uid_hash[3]]);
    put32(&mut mobi, 16, unique_id);
    put32(&mut mobi, 20, 8); // MOBI file version 8 (KF8)
    // NOTE(review): offset 24 mirrors the fragment INDX record also written
    // at offset 232 — confirm against the KF8 header layout.
    put32(&mut mobi, 24, fragment_indx_record as u32);
    put32(&mut mobi, 28, 0xFFFFFFFF);
    put32(&mut mobi, 32, 0xFFFFFFFF);
    put32(&mut mobi, 36, 0xFFFFFFFF);
    // Remaining index slots (offsets 40..60): not present.
    for off in (40..64).step_by(4) {
        put32(&mut mobi, off, 0xFFFFFFFF);
    }
    put32(&mut mobi, 64, first_non_book_record as u32);
    put32(&mut mobi, 76, locale_code(&opf.language));
    // Input/output language 0; value 8 at offset 88 (presumably minimum
    // reader version — matches the file version above); first image record.
    put32(&mut mobi, 80, 0); put32(&mut mobi, 84, 0); put32(&mut mobi, 88, 8); put32(&mut mobi, 92, first_image_record as u32);
    put32(&mut mobi, 96, 0); put32(&mut mobi, 100, 0); // no Huffman compression records
    put32(&mut mobi, 112, 0x50); // EXTH flags; the 0x40 bit marks the EXTH block as present
    put32(&mut mobi, 148, 0xFFFFFFFF); put32(&mut mobi, 152, 0xFFFFFFFF);
    put32(&mut mobi, 176, fdst_record as u32);
    put32(&mut mobi, 180, fdst_flow_count as u32);
    put32(&mut mobi, 184, fcis_record as u32);
    put32(&mut mobi, 188, 1); // FCIS record count
    put32(&mut mobi, 192, flis_record as u32);
    put32(&mut mobi, 196, 1); // FLIS record count
    // SRCS (embedded source) records, when the caller provided one.
    if let Some(srcs_idx) = srcs_record {
        put32(&mut mobi, 208, srcs_idx as u32);
        put32(&mut mobi, 212, 1);
    } else {
        put32(&mut mobi, 208, 0xFFFFFFFF);
        put32(&mut mobi, 212, 0);
    }
    put32(&mut mobi, 216, 0xFFFFFFFF);
    put32(&mut mobi, 220, 0xFFFFFFFF);
    // NOTE(review): the low half of this u32 looks like the "extra record
    // data" flags (0b11) governing text-record trailing entries — confirm.
    put32(&mut mobi, 224, 3);
    put32(&mut mobi, 228, ncx_record as u32);
    put32(&mut mobi, 232, fragment_indx_record as u32);
    put32(&mut mobi, 236, skeleton_indx_record as u32);
    put32(&mut mobi, 240, datp_record as u32);
    put32(&mut mobi, 244, 0xFFFFFFFF);
    put32(&mut mobi, 248, 0xFFFFFFFF);
    put32(&mut mobi, 252, 0x00000000);
    put32(&mut mobi, 256, 0xFFFFFFFF);
    // EXTH metadata block (title/author/date, cover pointers, layout hints).
    let exth_data = exth::build_book_exth(
        full_name,
        &opf.author,
        &opf.date,
        &opf.language,
        cover_offset,
        thumb_offset,
        kf8_cover_uri,
        fixed_layout,
        None, hd_geometry,
        creator_tag,
        doc_type,
        None, None, None, None, );
    // The full title is stored after PalmDOC (16) + MOBI header + EXTH;
    // its offset is measured from the start of the record.
    let full_name_offset = 16 + MOBI_HEADER_LENGTH + exth_data.len();
    put32(&mut mobi, 68, full_name_offset as u32);
    put32(&mut mobi, 72, full_name_bytes.len() as u32);
    // Concatenate the pieces, pad to a 4-byte boundary, then pad up to the
    // fixed minimum size for record 0.
    let mut record0 = Vec::new();
    record0.extend_from_slice(&palmdoc);
    record0.extend_from_slice(&mobi);
    record0.extend_from_slice(&exth_data);
    record0.extend_from_slice(full_name_bytes);
    while record0.len() % 4 != 0 {
        record0.push(0x00);
    }
    // NOTE(review): 8892 appears chosen to match kindlegen's record-0
    // size — confirm before changing.
    const MIN_RECORD0_SIZE: usize = 8892;
    if record0.len() < MIN_RECORD0_SIZE {
        record0.resize(MIN_RECORD0_SIZE, 0x00);
    }
    record0
}
/// Build the FLIS record: a fixed 36-byte block whose contents are the same
/// in every file this builder produces.
fn build_flis() -> Vec<u8> {
    let mut flis = vec![0u8; 36];
    flis[0..4].copy_from_slice(b"FLIS");
    flis[4..8].copy_from_slice(&8u32.to_be_bytes());
    flis[8..10].copy_from_slice(&65u16.to_be_bytes());
    // bytes 10..16 stay zero
    flis[16..20].copy_from_slice(&0xFFFF_FFFFu32.to_be_bytes());
    flis[20..22].copy_from_slice(&1u16.to_be_bytes());
    flis[22..24].copy_from_slice(&3u16.to_be_bytes());
    flis[24..28].copy_from_slice(&3u32.to_be_bytes());
    flis[28..32].copy_from_slice(&1u32.to_be_bytes());
    flis[32..36].copy_from_slice(&0xFFFF_FFFFu32.to_be_bytes());
    flis
}
/// Build the FCIS record for `text_length` bytes of text spread over
/// `flow_count` flows. A flow count of zero is treated as one entry.
fn build_fcis(text_length: usize, flow_count: usize) -> Vec<u8> {
    let entries = if flow_count == 0 { 1 } else { flow_count };
    // Per-entry block size: 0x28 when there are multiple entries, else 0x20.
    let per_entry: u32 = if entries > 1 { 0x28 } else { 0x20 };
    let mut out = Vec::with_capacity(36 + entries * 8);
    out.extend_from_slice(b"FCIS");
    for word in [20u32, 16, entries as u32, 0, text_length as u32, 0, per_entry] {
        out.extend_from_slice(&word.to_be_bytes());
    }
    // One additional (0, block_size) pair per entry beyond the first.
    for _ in 1..entries {
        out.extend_from_slice(&0u32.to_be_bytes());
        out.extend_from_slice(&per_entry.to_be_bytes());
    }
    out.extend_from_slice(&8u32.to_be_bytes());
    out.extend_from_slice(&1u16.to_be_bytes());
    out.extend_from_slice(&1u16.to_be_bytes());
    out.extend_from_slice(&0u32.to_be_bytes());
    out
}
/// Build the CMET record payload: a creator string identifying this tool
/// and its crate version, e.g. "kindling 1.2.3".
fn build_cmet() -> Vec<u8> {
    ["kindling ", env!("CARGO_PKG_VERSION")].concat().into_bytes()
}
/// The 4-byte end-of-file record that terminates the container.
fn build_eof() -> Vec<u8> {
    b"\xE9\x8E\x0D\x0A".to_vec()
}
/// Assemble the complete PalmDB container: the 78-byte database header, one
/// 8-byte entry per record, a 2-byte pad, then every record's payload back
/// to back.
fn build_palmdb(title: &str, records: &[Vec<u8>]) -> Vec<u8> {
    let count = records.len();
    // Payload data begins right after header + record list + 2-byte gap.
    let data_start = 78 + count * 8 + 2;
    let mut offsets = Vec::with_capacity(count);
    let mut pos = data_start;
    for payload in records {
        offsets.push(pos as u32);
        pos += payload.len();
    }
    // Derive the database name from the title: drop characters that are
    // unsafe in names, then join the remaining words with underscores.
    let forbidden = "()[]:/\\*?\"<>|";
    let cleaned: String = title.chars().filter(|c| !forbidden.contains(*c)).collect();
    let mut db_name = cleaned.split_whitespace().collect::<Vec<_>>().join("_");
    if db_name.len() > 31 {
        // Cut at (or just before) byte 28 on a char boundary, then mark the
        // truncation with "..." so the result stays within 31 bytes.
        let mut end = db_name.len().min(28);
        while end > 0 && !db_name.is_char_boundary(end) {
            end -= 1;
        }
        db_name.truncate(end);
        db_name.push_str("...");
    }
    let mut name_field = [0u8; 32];
    let raw = db_name.as_bytes();
    let n = raw.len().min(31); // keep the final byte as a NUL terminator
    name_field[..n].copy_from_slice(&raw[..n]);
    let stamp = palm_timestamp();
    let mut header = vec![0u8; 78];
    header[..32].copy_from_slice(&name_field);
    put16(&mut header, 32, 0); // attributes
    put16(&mut header, 34, 0); // version
    put32(&mut header, 36, stamp); // creation date
    put32(&mut header, 40, stamp); // modification date
    put32(&mut header, 44, 0); // last backup
    put32(&mut header, 48, 0); // modification number
    put32(&mut header, 52, 0); // appInfo offset
    put32(&mut header, 56, 0); // sortInfo offset
    header[60..64].copy_from_slice(b"BOOK");
    header[64..68].copy_from_slice(b"MOBI");
    put32(&mut header, 68, ((count - 1) * 2 + 1) as u32); // unique-ID seed
    put32(&mut header, 72, 0); // next record list
    put16(&mut header, 76, count as u16);
    // Record list: 4-byte offset, 1 attribute byte (0), 3-byte unique id.
    let mut entries = Vec::with_capacity(count * 8);
    for (i, off) in offsets.iter().enumerate() {
        entries.extend_from_slice(&off.to_be_bytes());
        let entry_id = ((i * 2) as u32) & 0x00FF_FFFF; // even ids, 24-bit field
        entries.extend_from_slice(&entry_id.to_be_bytes());
    }
    let body_len: usize = records.iter().map(|r| r.len()).sum();
    let mut out = Vec::with_capacity(78 + entries.len() + 2 + body_len);
    out.extend_from_slice(&header);
    out.extend_from_slice(&entries);
    out.extend_from_slice(&[0u8, 0]);
    for payload in records {
        out.extend_from_slice(payload);
    }
    out
}
/// Map an OPF language tag to the locale code stored in the MOBI header.
/// Unrecognized tags fall back to US English (0x0409).
fn locale_code(lang: &str) -> u32 {
    const FALLBACK: u32 = 0x0409; // en / en-US
    const TABLE: &[(&str, u32)] = &[
        ("en", 0x0409), ("en-US", 0x0409),
        ("en-GB", 0x0809),
        ("el", 0x0408), ("el-GR", 0x0408),
        ("de", 0x0407), ("de-DE", 0x0407),
        ("fr", 0x040C), ("fr-FR", 0x040C),
        ("es", 0x0C0A), ("es-ES", 0x0C0A),
        ("it", 0x0410), ("it-IT", 0x0410),
        ("pt", 0x0416), ("pt-BR", 0x0416),
        ("pt-PT", 0x0816),
        ("nl", 0x0413), ("nl-NL", 0x0413),
        ("ru", 0x0419), ("ru-RU", 0x0419),
        ("ja", 0x0411), ("ja-JP", 0x0411),
        ("zh", 0x0804), ("zh-CN", 0x0804),
        ("zh-TW", 0x0404),
        ("ko", 0x0412), ("ko-KR", 0x0412),
        ("ar", 0x0401), ("ar-SA", 0x0401),
        ("he", 0x040D), ("he-IL", 0x040D),
        ("tr", 0x041F), ("tr-TR", 0x041F),
    ];
    TABLE
        .iter()
        .find(|&&(tag, _)| tag == lang)
        .map(|&(_, code)| code)
        .unwrap_or(FALLBACK)
}
/// Current time in the PalmOS epoch (seconds since 1904-01-01), i.e. the
/// Unix timestamp shifted forward by 2,082,844,800 seconds. A clock before
/// the Unix epoch degrades to the offset itself rather than panicking.
fn palm_timestamp() -> u32 {
    const UNIX_TO_PALM: u64 = 2_082_844_800;
    let since_epoch = SystemTime::now()
        .duration_since(UNIX_EPOCH)
        .map(|d| d.as_secs())
        .unwrap_or(0);
    (since_epoch + UNIX_TO_PALM) as u32
}
/// Compute the MD5 digest of `data` (RFC 1321), dependency-free.
/// Used only for deterministic unique-ID generation, not for security.
fn md5_simple(data: &[u8]) -> [u8; 16] {
    // Per-round left-rotation amounts.
    const S: [u32; 64] = [
        7, 12, 17, 22, 7, 12, 17, 22, 7, 12, 17, 22, 7, 12, 17, 22,
        5, 9, 14, 20, 5, 9, 14, 20, 5, 9, 14, 20, 5, 9, 14, 20,
        4, 11, 16, 23, 4, 11, 16, 23, 4, 11, 16, 23, 4, 11, 16, 23,
        6, 10, 15, 21, 6, 10, 15, 21, 6, 10, 15, 21, 6, 10, 15, 21,
    ];
    // Per-round additive constants (floor(2^32 * |sin(i+1)|)).
    const K: [u32; 64] = [
        0xd76aa478, 0xe8c7b756, 0x242070db, 0xc1bdceee, 0xf57c0faf, 0x4787c62a, 0xa8304613,
        0xfd469501, 0x698098d8, 0x8b44f7af, 0xffff5bb1, 0x895cd7be, 0x6b901122, 0xfd987193,
        0xa679438e, 0x49b40821, 0xf61e2562, 0xc040b340, 0x265e5a51, 0xe9b6c7aa, 0xd62f105d,
        0x02441453, 0xd8a1e681, 0xe7d3fbc8, 0x21e1cde6, 0xc33707d6, 0xf4d50d87, 0x455a14ed,
        0xa9e3e905, 0xfcefa3f8, 0x676f02d9, 0x8d2a4c8a, 0xfffa3942, 0x8771f681, 0x6d9d6122,
        0xfde5380c, 0xa4beea44, 0x4bdecfa9, 0xf6bb4b60, 0xbebfbc70, 0x289b7ec6, 0xeaa127fa,
        0xd4ef3085, 0x04881d05, 0xd9d4d039, 0xe6db99e5, 0x1fa27cf8, 0xc4ac5665, 0xf4292244,
        0x432aff97, 0xab9423a7, 0xfc93a039, 0x655b59c3, 0x8f0ccc92, 0xffeff47d, 0x85845dd1,
        0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1, 0xf7537e82, 0xbd3af235, 0x2ad7d2bb,
        0xeb86d391,
    ];
    // Pad: append 0x80, zero-fill to 56 mod 64, then the bit length (LE).
    let mut padded = Vec::with_capacity(data.len() + 72);
    padded.extend_from_slice(data);
    padded.push(0x80);
    while padded.len() % 64 != 56 {
        padded.push(0x00);
    }
    padded.extend_from_slice(&((data.len() as u64) * 8).to_le_bytes());
    let mut state: [u32; 4] = [0x67452301, 0xEFCDAB89, 0x98BADCFE, 0x10325476];
    for block in padded.chunks_exact(64) {
        // Decode the block into sixteen little-endian words.
        let mut m = [0u32; 16];
        for i in 0..16 {
            m[i] = u32::from_le_bytes([
                block[4 * i],
                block[4 * i + 1],
                block[4 * i + 2],
                block[4 * i + 3],
            ]);
        }
        let [mut a, mut b, mut c, mut d] = state;
        for i in 0..64 {
            let (f, g) = if i < 16 {
                ((b & c) | (!b & d), i)
            } else if i < 32 {
                ((d & b) | (!d & c), (5 * i + 1) % 16)
            } else if i < 48 {
                (b ^ c ^ d, (3 * i + 5) % 16)
            } else {
                (c ^ (b | !d), (7 * i) % 16)
            };
            let rotated = a
                .wrapping_add(f)
                .wrapping_add(K[i])
                .wrapping_add(m[g])
                .rotate_left(S[i]);
            let new_b = b.wrapping_add(rotated);
            // Rotate the working registers: (a, b, c, d) <- (d, b', b, c).
            a = d;
            d = c;
            c = b;
            b = new_b;
        }
        state[0] = state[0].wrapping_add(a);
        state[1] = state[1].wrapping_add(b);
        state[2] = state[2].wrapping_add(c);
        state[3] = state[3].wrapping_add(d);
    }
    // Serialize the state words little-endian into the 16-byte digest.
    let mut digest = [0u8; 16];
    for (i, word) in state.iter().enumerate() {
        digest[4 * i..4 * i + 4].copy_from_slice(&word.to_le_bytes());
    }
    digest
}
/// Copy `data` into `buf` starting at `offset`; panics (like a slice copy)
/// when the destination range runs past the end of `buf`.
fn put_bytes(buf: &mut [u8], offset: usize, data: &[u8]) {
    for (i, &byte) in data.iter().enumerate() {
        buf[offset + i] = byte;
    }
}
/// Write `value` into `buf` at `offset` as two big-endian bytes.
fn put16(buf: &mut [u8], offset: usize, value: u16) {
    let [hi, lo] = value.to_be_bytes();
    buf[offset] = hi;
    buf[offset + 1] = lo;
}
/// Write `value` into `buf` at `offset` as four big-endian bytes.
fn put32(buf: &mut [u8], offset: usize, value: u32) {
    for (i, byte) in value.to_be_bytes().iter().enumerate() {
        buf[offset + i] = *byte;
    }
}
/// Byte offset of the first occurrence of `pattern` in `data`, if any
/// (delegates to memchr's substring search; an empty pattern matches at 0).
fn find_bytes(data: &[u8], pattern: &[u8]) -> Option<usize> {
    memchr::memmem::find(data, pattern)
}
/// First occurrence of `pattern` at or after byte `start`, as an index into
/// the full slice. Unlike `find_bytes`, an empty pattern yields `None`, as
/// does a `start` past the end of `data`.
fn find_bytes_from(data: &[u8], pattern: &[u8], start: usize) -> Option<usize> {
    if pattern.is_empty() || start >= data.len() {
        return None;
    }
    memchr::memmem::find(&data[start..], pattern).map(|pos| pos + start)
}
/// Byte offset of the last occurrence of `pattern` in `data`, or `None`
/// when the pattern is longer than the data (or simply absent).
fn rfind_bytes(data: &[u8], pattern: &[u8]) -> Option<usize> {
    if data.len() < pattern.len() {
        return None;
    }
    memchr::memmem::rfind(data, pattern)
}