use quick_xml::events::Event;
use quick_xml::Reader;
/// Tag names that the tag-balance walker in this file never expects to be closed.
///
/// Names are matched ASCII-case-insensitively by the code that consults this
/// list. The first group is the standard HTML void elements; the last two
/// entries are additional names this module treats the same way.
pub const VOID_ELEMENTS: &[&str] = &[
    // Standard HTML void elements.
    "area",
    "base",
    "br",
    "col",
    "embed",
    "hr",
    "img",
    "input",
    "link",
    "meta",
    "param",
    "source",
    "track",
    "wbr",
    // Extra names that are also exempt from open/close tracking.
    "mbp:pagebreak",
    "guide",
];
/// Feeds the whole text blob through a leniently configured `quick_xml` reader
/// and reports the first error the reader raises.
///
/// The reader is set up with end-name checking off, unmatched end tags allowed,
/// comment checking off, and no trimming of text at either end, so only errors
/// the reader still reports under that configuration make this function fail.
///
/// # Errors
///
/// Returns a message if `content` is not valid UTF-8, or if the reader returns
/// an error (the message includes the reader's buffer position at that point).
pub fn parse_mobi_html(content: &[u8]) -> Result<(), String> {
    let text = match std::str::from_utf8(content) {
        Ok(text) => text,
        Err(utf8_err) => {
            return Err(format!("text blob is not valid UTF-8: {}", utf8_err));
        }
    };

    let mut reader = Reader::from_str(text);
    let config = reader.config_mut();
    config.check_end_names = false;
    config.allow_unmatched_ends = true;
    config.check_comments = false;
    config.trim_text_start = false;
    config.trim_text_end = false;

    // Scratch space handed to the reader for each event; emptied after every event.
    let mut scratch = Vec::new();
    loop {
        match reader.read_event_into(&mut scratch) {
            Err(parse_err) => {
                let offset = reader.buffer_position();
                return Err(format!(
                    "XML parse error at byte {}: {}",
                    offset, parse_err
                ));
            }
            Ok(Event::Eof) => break,
            // Every other event is only consumed; its content is not inspected.
            Ok(_) => {}
        }
        scratch.clear();
    }
    Ok(())
}
/// Walks the blob's start/end tag events and verifies that they nest properly.
///
/// Every start tag whose name is not in [`VOID_ELEMENTS`] is pushed on a stack;
/// every end tag whose name is not in [`VOID_ELEMENTS`] must match the tag on
/// top of the stack. Names are compared ASCII-case-insensitively. Self-closing
/// (empty) elements and all non-tag events are ignored.
///
/// # Errors
///
/// Returns a message when:
/// * `content` is not valid UTF-8,
/// * the reader reports an error,
/// * an end tag does not match the most recent open tag,
/// * an end tag appears while no tag is open, or
/// * tags are still open when the input ends.
pub fn check_balanced_tags(content: &[u8]) -> Result<(), String> {
    let content_str = std::str::from_utf8(content)
        .map_err(|e| format!("text blob is not valid UTF-8: {}", e))?;
    let mut reader = Reader::from_str(content_str);
    {
        // Lenient reader: nesting is checked by the stack below, not by quick_xml.
        let cfg = reader.config_mut();
        cfg.check_end_names = false;
        cfg.allow_unmatched_ends = true;
        cfg.check_comments = false;
        cfg.trim_text_start = false;
        cfg.trim_text_end = false;
    }

    let is_void = |name: &str| VOID_ELEMENTS.iter().any(|v| v.eq_ignore_ascii_case(name));

    let mut stack: Vec<String> = Vec::new();
    let mut buf = Vec::new();
    loop {
        match reader.read_event_into(&mut buf) {
            Ok(Event::Eof) => break,
            Ok(Event::Start(e)) => {
                let name = String::from_utf8_lossy(e.name().as_ref()).into_owned();
                if !is_void(&name) {
                    stack.push(name);
                }
            }
            Ok(Event::End(e)) => {
                let name = String::from_utf8_lossy(e.name().as_ref()).into_owned();
                // Closers of void names are skipped, mirroring how their openers
                // are never pushed.
                if !is_void(&name) {
                    match stack.pop() {
                        Some(open) if open.eq_ignore_ascii_case(&name) => {}
                        Some(open) => {
                            return Err(format!(
                                "mismatched close </{}>, expected </{}>",
                                name, open
                            ));
                        }
                        None => {
                            return Err(format!(
                                "close </{}> with no matching open",
                                name
                            ));
                        }
                    }
                }
            }
            Err(e) => {
                return Err(format!(
                    "walker parse error at byte {}: {}",
                    reader.buffer_position(),
                    e
                ));
            }
            // Empty elements, text, comments, etc. do not affect nesting.
            Ok(_) => {}
        }
        // No arm above uses `continue`, so every iteration reaches this point;
        // skipping it would let the scratch buffer grow with each event.
        buf.clear();
    }

    if !stack.is_empty() {
        return Err(format!(
            "unclosed tags at EOF: {:?}",
            stack
        ));
    }
    Ok(())
}
/// Byte-level scan for two corruption signatures in the text blob.
///
/// 1. Every occurrence of `<hr/` must be immediately followed by `>`. An
///    occurrence that sits at the very end of the blob (nothing follows it) is
///    reported as malformed as well.
/// 2. Every `="` must be followed by a closing `"` within the next 4096 bytes,
///    and no `<` may appear before that closing quote.
///
/// The input is treated as raw bytes; it does not need to be valid UTF-8.
///
/// # Errors
///
/// Returns a message describing the first violation found, including its byte
/// offset and a short lossy-decoded excerpt starting at that offset. All `<hr/`
/// occurrences are checked before any attribute quote is examined.
pub fn check_no_corruption(content: &[u8]) -> Result<(), String> {
    const HR_PREFIX: &[u8] = b"<hr/";
    const ATTR_SCAN_WINDOW: usize = 4096;

    // Lossy excerpt of at most `len` bytes starting at `from`, for error messages.
    let excerpt = |from: usize, len: usize| -> String {
        let end = (from + len).min(content.len());
        String::from_utf8_lossy(&content[from..end]).into_owned()
    };

    // Pass 1: `<hr/` must be followed by `>`. `get` yields `None` when the
    // prefix is the last thing in the blob, which is flagged like any other
    // wrong follower.
    for (pos, window) in content.windows(HR_PREFIX.len()).enumerate() {
        if window == HR_PREFIX && content.get(pos + HR_PREFIX.len()) != Some(&b'>') {
            return Err(format!(
                "malformed `<hr/` at byte {}: {:?}",
                pos,
                excerpt(pos, 20)
            ));
        }
    }

    // Pass 2: every `="` needs its closing quote before any `<` and within the
    // scan window.
    let mut i = 0;
    while i + 1 < content.len() {
        if content[i] != b'=' || content[i + 1] != b'"' {
            i += 1;
            continue;
        }
        let start = i + 2;
        let window_end = (start + ATTR_SCAN_WINDOW).min(content.len());
        // Whichever of `"` / `<` comes first decides the outcome.
        let first_delim = content[start..window_end]
            .iter()
            .position(|&b| b == b'"' || b == b'<');
        match first_delim {
            Some(off) if content[start + off] == b'"' => {
                // Resume scanning right after the closing quote.
                i = start + off + 1;
            }
            Some(_) => {
                return Err(format!(
                    "unclosed attribute quote at byte {}: {:?}",
                    i,
                    excerpt(i, 60)
                ));
            }
            None => {
                return Err(format!(
                    "unclosed attribute quote (no `\"` within {} bytes) at byte {}: {:?}",
                    ATTR_SCAN_WINDOW,
                    i,
                    excerpt(i, 60)
                ));
            }
        }
    }
    Ok(())
}
/// Runs all three whole-blob checks and collects their failures.
///
/// The checks always run in this order, and each one runs regardless of whether
/// an earlier one failed: `parse_mobi_html`, `check_no_corruption`,
/// `check_balanced_tags`. Each failure message is prefixed with a label naming
/// the check that produced it.
///
/// Returns an empty vector when every check passes.
pub fn validate_text_blob(blob: &[u8]) -> Vec<String> {
    let outcomes = [
        parse_mobi_html(blob).map_err(|e| format!("HTML parse: {}", e)),
        check_no_corruption(blob).map_err(|e| format!("corruption scan: {}", e)),
        check_balanced_tags(blob).map_err(|e| format!("tag balance: {}", e)),
    ];

    let mut errors = Vec::new();
    for outcome in outcomes {
        // `err()` is `Some(message)` only for a failed check.
        errors.extend(outcome.err());
    }
    errors
}
/// Per-record sanity checks over byte ranges of the text blob.
///
/// Each entry of `records` is a `(start, end)` byte range into `blob`. For
/// every record the function:
///
/// * compares the number of literal `<b>`, `<i>`, `<p>` and `<h5>` openers with
///   the number of matching literal closers, and
/// * checks whether the record ends while inside a tag (via
///   `record_ends_in_tag`).
///
/// Records whose range is inverted or runs past the end of `blob`, and records
/// that are not valid UTF-8 on their own, are skipped without being reported.
///
/// At most `max_issues` per-record messages are collected. If any problem was
/// counted at all, a final `summary:` line is appended regardless of that cap.
pub fn validate_records(
    blob: &[u8],
    records: &[(usize, usize)],
    max_issues: usize,
) -> Vec<String> {
    // Tags whose open/close counts are compared, in summary order.
    const TRACKED_TAGS: [&str; 4] = ["b", "i", "p", "h5"];

    // Number of records found unbalanced for each entry of `TRACKED_TAGS`.
    let mut unbalanced = [0usize; 4];
    let mut mid_tag = 0usize;
    let mut issues = Vec::new();

    for (idx, &(s, e)) in records.iter().enumerate() {
        // `get` rejects both `s > e` and `e > blob.len()`.
        let rec = match blob.get(s..e) {
            Some(bytes) => bytes,
            None => continue,
        };
        let rec_str = match std::str::from_utf8(rec) {
            Ok(text) => text,
            Err(_) => continue,
        };

        for (tag, counter) in TRACKED_TAGS.iter().zip(unbalanced.iter_mut()) {
            let open_pat = format!("<{}>", tag);
            let close_pat = format!("</{}>", tag);
            let opens = rec_str.matches(open_pat.as_str()).count() as i32;
            let closes = rec_str.matches(close_pat.as_str()).count() as i32;
            if opens == closes {
                continue;
            }
            *counter += 1;
            if issues.len() < max_issues {
                issues.push(format!(
                    "record {} ({}..{}): <{}> unbalanced (opens={}, closes={})",
                    idx, s, e, tag, opens, closes
                ));
            }
        }

        if record_ends_in_tag(rec) {
            mid_tag += 1;
            if issues.len() < max_issues {
                // Show at most the last 40 bytes of the record.
                let tail_start = rec.len().saturating_sub(40);
                let tail = String::from_utf8_lossy(&rec[tail_start..]);
                issues.push(format!(
                    "record {} ({}..{}): ends inside an HTML tag, tail={:?}",
                    idx, s, e, tail
                ));
            }
        }
    }

    let [bad_b, bad_i, bad_p, bad_h5] = unbalanced;
    if bad_b + bad_i + bad_p + bad_h5 + mid_tag > 0 {
        issues.push(format!(
            "summary: {}/{} records unbalanced-<b>, {} unbalanced-<i>, {} unbalanced-<p>, {} unbalanced-<h5>, {} mid-tag",
            bad_b, records.len(), bad_i, bad_p, bad_h5, mid_tag
        ));
    }
    issues
}
/// Reports whether `rec` ends after a `<` that has no later `>`.
///
/// Only the last angle bracket in the record matters: the result is `true`
/// exactly when that bracket is `<`. A record with no angle brackets at all
/// yields `false`.
fn record_ends_in_tag(rec: &[u8]) -> bool {
    rec.iter()
        .rev()
        .find(|&&byte| byte == b'<' || byte == b'>')
        .map_or(false, |&last_bracket| last_bracket == b'<')
}
/// Writes the self-check report to standard error.
///
/// Output is a heading line, one ` - ` bullet line per entry of `issues`, and
/// two fixed closing lines (where to report the problem, and how to silence
/// the warnings). The heading and closing lines are printed even when `issues`
/// is empty.
pub fn print_self_check_warnings(issues: &[String]) {
    // Fixed lines printed after the bullet list.
    const CLOSING_LINES: [&str; 2] = [
        "These may indicate a kindling bug. Please report at \
         https://github.com/ciscoriordan/kindling/issues",
        "The MOBI will still be written; use --no-self-check to suppress these warnings.",
    ];

    eprintln!("Warning: MOBI output self-check found issues:");
    issues.iter().for_each(|entry| eprintln!(" - {}", entry));
    for line in CLOSING_LINES.iter() {
        eprintln!("{}", line);
    }
}