use anyhow::Result;
use super::{ContentHandler, ConversionResult};
// Only the first 2 MiB of the input are scanned for text; `to_markdown`
// truncates the byte slice before calling `extract_pdf_text`.
const MAX_SCAN_BYTES: usize = 2 * 1024 * 1024;
// Upper bound on extracted text size. NOTE(review): this is compared against
// `String::len()` in `extract_pdf_text`, so it is effectively bytes, not chars.
const MAX_OUTPUT_CHARS: usize = 50_000;
/// Dependency-free fallback PDF handler: sniffs the `%PDF-` header, estimates
/// the page count from `/Type /Page` markers, and best-effort scrapes text
/// tokens from the raw bytes (full-fidelity extraction requires `--features pdf`).
pub struct PdfLightHandler;
impl ContentHandler for PdfLightHandler {
    /// This handler only claims PDF payloads.
    fn supported_types(&self) -> &[&str] {
        &["application/pdf"]
    }

    /// Convert raw PDF bytes to a markdown summary plus any scraped text.
    ///
    /// Fails with an error when the `%PDF-` header is missing; otherwise the
    /// result always carries a page count (at least 1) and the elapsed time.
    fn to_markdown(&self, bytes: &[u8], content_type: &str) -> Result<ConversionResult> {
        let begun = std::time::Instant::now();
        anyhow::ensure!(is_pdf(bytes), "Not a PDF: missing %PDF- header");
        let pages = count_pages(bytes);
        // Large files are only scanned up to MAX_SCAN_BYTES for text.
        let scan_window = &bytes[..bytes.len().min(MAX_SCAN_BYTES)];
        let extracted = extract_pdf_text(scan_window);
        Ok(ConversionResult {
            markdown: build_markdown(extracted, pages),
            page_count: Some(pages),
            content_type: content_type.to_string(),
            elapsed_ms: begun.elapsed().as_secs_f64() * 1000.0,
            quality: None,
        })
    }
}
/// True when the input begins with the literal PDF file header `%PDF-`.
fn is_pdf(bytes: &[u8]) -> bool {
    bytes.get(..5).is_some_and(|head| head == b"%PDF-")
}
/// Estimate the page count by counting `/Type /Page` dictionary entries in
/// both the spaced and compact spellings, defaulting to 1.
fn count_pages(bytes: &[u8]) -> usize {
    // Count non-overlapping hits that end on a non-alphanumeric byte, so that
    // `/Type /Page` does not also match the page-tree node `/Type /Pages`
    // (which would previously inflate the count).
    fn count_page_markers(haystack: &[u8], needle: &[u8]) -> usize {
        let mut count = 0;
        let mut i = 0;
        while i + needle.len() <= haystack.len() {
            if haystack[i..i + needle.len()] == *needle {
                let after = i + needle.len();
                if after >= haystack.len() || !haystack[after].is_ascii_alphanumeric() {
                    count += 1;
                }
                i += needle.len();
            } else {
                i += 1;
            }
        }
        count
    }
    let spaced = count_page_markers(bytes, b"/Type /Page");
    let compact = count_page_markers(bytes, b"/Type/Page");
    // PDFs normally use one spelling consistently, so take the larger count;
    // every valid PDF has at least one page.
    spaced.max(compact).max(1)
}
/// Count non-overlapping occurrences of `needle` in `haystack`.
fn count_occurrences(haystack: &[u8], needle: &[u8]) -> usize {
    // An empty needle would match everywhere; define that as zero hits.
    if needle.is_empty() {
        return 0;
    }
    let mut hits = 0;
    let mut rest = haystack;
    // Non-overlapping scan: after a hit, resume past the whole needle.
    while rest.len() >= needle.len() {
        if rest.starts_with(needle) {
            hits += 1;
            rest = &rest[needle.len()..];
        } else {
            rest = &rest[1..];
        }
    }
    hits
}
/// Best-effort text scrape: walk the raw bytes, track BT…ET text blocks, and
/// collect literal/hex/array strings, emitting them on the text-showing
/// operators (`Tj`, `TJ`, `'`, `"`) and starting a new output line on the
/// text-positioning operators `Td`/`TD`/`T*`.
///
/// Returns `None` when nothing printable was found. Output is capped at
/// MAX_OUTPUT_CHARS (compared via `String::len`, i.e. bytes).
fn extract_pdf_text(bytes: &[u8]) -> Option<String> {
    let mut output = String::with_capacity(4096);
    let mut in_bt_block = false;
    let mut pending_strings: Vec<String> = Vec::new();
    let mut i = 0;
    while i < bytes.len() && output.len() < MAX_OUTPUT_CHARS {
        if !in_bt_block {
            // Outside a text block: only look for the next BT token.
            if bytes[i..].starts_with(b"BT") && is_pdf_token_boundary(bytes, i, 2) {
                in_bt_block = true;
                i += 2;
                continue;
            }
            i += 1;
            continue;
        }
        if bytes[i..].starts_with(b"ET") && is_pdf_token_boundary(bytes, i, 2) {
            flush_strings(&mut pending_strings, &mut output);
            in_bt_block = false;
            i += 2;
            continue;
        }
        // Literal string: (...)
        if bytes[i] == b'('
            && let Some((s, consumed)) = parse_literal_string(&bytes[i..])
        {
            pending_strings.push(s);
            i += consumed;
            continue;
        }
        // Hex string: <...> (but not a dictionary start <<).
        if bytes[i] == b'<'
            && bytes.get(i + 1).is_some_and(|&b| b != b'<')
            && let Some((s, consumed)) = parse_hex_string(&bytes[i..])
        {
            pending_strings.push(s);
            i += consumed;
            continue;
        }
        // TJ-style array of strings: [(..) <..> ..]
        if bytes[i] == b'['
            && let Some((strings, consumed)) = parse_array_strings(&bytes[i..])
        {
            pending_strings.extend(strings);
            i += consumed;
            continue;
        }
        if matches!(bytes[i], b'T' | b'\'' | b'"') {
            let op_end = scan_operator_end(bytes, i);
            match &bytes[i..op_end] {
                // Text-showing operators: emit the accumulated strings.
                b"Tj" | b"TJ" | b"'" | b"\"" => {
                    flush_strings(&mut pending_strings, &mut output);
                }
                // Text-positioning operators begin a new line. Fix: this was
                // previously a separate `if` AFTER this branch, which was dead
                // code because every 'T' byte was already consumed here.
                b"Td" | b"TD" | b"T*" => {
                    if !output.is_empty() && !output.ends_with('\n') {
                        output.push('\n');
                    }
                }
                _ => {}
            }
            i = op_end;
            continue;
        }
        i += 1;
    }
    // The input may be truncated (caller caps at MAX_SCAN_BYTES): salvage any
    // strings collected in an unterminated BT block instead of dropping them.
    flush_strings(&mut pending_strings, &mut output);
    if output.trim().is_empty() {
        None
    } else {
        Some(output)
    }
}
/// Join the buffered string fragments into one trimmed run and append it to
/// `output`, separated from prior content by a space unless at a line start.
/// The pending buffer is emptied either way.
fn flush_strings(pending: &mut Vec<String>, output: &mut String) {
    // Nothing buffered: leave the output untouched.
    if pending.is_empty() {
        return;
    }
    let joined: String = std::mem::take(pending).into_iter().collect();
    let piece = joined.trim();
    if piece.is_empty() {
        return;
    }
    // Space-separate from previous content unless we just started a line.
    if !output.is_empty() && !output.ends_with('\n') {
        output.push(' ');
    }
    output.push_str(piece);
}
/// True when the `len`-byte token at offset `i` is delimited on both sides by
/// whitespace, a PDF delimiter byte, or the edge of the input.
fn is_pdf_token_boundary(bytes: &[u8], i: usize, len: usize) -> bool {
    // Same delimiter/whitespace set as is_pdf_delimiter_or_ws, inlined so the
    // boundary check reads as one unit.
    let is_boundary_byte = |b: u8| {
        matches!(
            b,
            b' ' | b'\t'
                | b'\n'
                | b'\r'
                | b'('
                | b')'
                | b'['
                | b']'
                | b'{'
                | b'}'
                | b'/'
                | b'<'
                | b'>'
        )
    };
    let clear_before = i == 0 || is_boundary_byte(bytes[i - 1]);
    let clear_after = i + len >= bytes.len() || is_boundary_byte(bytes[i + len]);
    clear_before && clear_after
}
/// True for PDF whitespace and the delimiter bytes this light parser treats
/// as token boundaries.
fn is_pdf_delimiter_or_ws(b: u8) -> bool {
    const BOUNDARY_BYTES: &[u8] = b" \t\n\r()[]{}/<>";
    BOUNDARY_BYTES.contains(&b)
}
/// Parse a PDF literal string `(...)`, returning the decoded text and the
/// number of input bytes consumed (including both parentheses).
///
/// Handles nested balanced parentheses, the standard escapes `\n \r \t \b \f`,
/// escaped delimiters `\( \) \\`, octal escapes `\d`/`\dd`/`\ddd`, and the
/// backslash-newline line continuation (ISO 32000-1 §7.3.4.2). Previously
/// `\b`/`\f` leaked a literal 'b'/'f' and octal escapes decoded as digits.
/// Returns `None` when the string is never closed.
fn parse_literal_string(bytes: &[u8]) -> Option<(String, usize)> {
    if bytes.first() != Some(&b'(') {
        return None;
    }
    let mut result = String::new();
    let mut i = 1;
    let mut depth = 1usize;
    while i < bytes.len() {
        match bytes[i] {
            b'\\' if i + 1 < bytes.len() => match bytes[i + 1] {
                b'n' => {
                    result.push('\n');
                    i += 2;
                }
                b'r' => {
                    result.push('\r');
                    i += 2;
                }
                b't' => {
                    result.push('\t');
                    i += 2;
                }
                // \b and \f decode to control bytes that sanitize_pdf_string
                // blanks anyway; emit nothing rather than a literal 'b'/'f'.
                b'b' | b'f' => {
                    i += 2;
                }
                // Octal escape: up to three octal digits, value taken mod 256.
                b'0'..=b'7' => {
                    let mut value: u32 = 0;
                    let mut digits = 0;
                    while digits < 3 && i + 1 + digits < bytes.len() {
                        let d = bytes[i + 1 + digits];
                        if !(b'0'..=b'7').contains(&d) {
                            break;
                        }
                        value = value * 8 + u32::from(d - b'0');
                        digits += 1;
                    }
                    let byte = (value & 0xFF) as u8;
                    // Same printable-ASCII filter as the unescaped path.
                    if byte.is_ascii_graphic() || byte == b' ' {
                        result.push(char::from(byte));
                    }
                    i += 1 + digits;
                }
                // Backslash before a line break is a continuation: no output.
                b'\n' => {
                    i += 2;
                }
                b'\r' => {
                    i += 2;
                    if bytes.get(i) == Some(&b'\n') {
                        i += 1;
                    }
                }
                // \( \) \\ and unknown escapes: the escaped byte itself.
                c => {
                    result.push(char::from(c));
                    i += 2;
                }
            },
            b'(' => {
                depth += 1;
                result.push('(');
                i += 1;
            }
            b')' => {
                depth -= 1;
                if depth == 0 {
                    return Some((sanitize_pdf_string(&result), i + 1));
                }
                result.push(')');
                i += 1;
            }
            b => {
                // Keep printable ASCII; drop binary noise.
                if b.is_ascii_graphic() || b == b' ' {
                    result.push(char::from(b));
                }
                i += 1;
            }
        }
    }
    // Ran off the end without a closing ')': unterminated string.
    None
}
/// Parse a PDF hex string `<...>`, returning the decoded text and the number
/// of bytes consumed (including both angle brackets). `None` if never closed.
fn parse_hex_string(bytes: &[u8]) -> Option<(String, usize)> {
    if bytes.first() != Some(&b'<') {
        return None;
    }
    // Find the closing '>' after the opening '<'; bail out if it never comes.
    let close = bytes[1..].iter().position(|&b| b == b'>')?;
    let decoded = decode_hex_string(&bytes[1..=close]);
    // Consumed: '<', the hex payload, and '>'.
    Some((decoded, close + 2))
}
/// Decode the payload of a PDF hex string into printable ASCII text.
/// Whitespace inside the payload is ignored; a lone trailing digit is padded
/// with an implicit 0; malformed digits are skipped one byte at a time.
fn decode_hex_string(hex: &[u8]) -> String {
    // Whitespace may appear anywhere inside a hex string; strip it first.
    let compact: Vec<u8> = hex
        .iter()
        .copied()
        .filter(|b| !b.is_ascii_whitespace())
        .collect();
    let mut decoded = String::new();
    let mut pos = 0;
    while pos < compact.len() {
        let high = hex_digit(compact[pos]);
        // An odd trailing digit acts as if followed by '0'.
        let low = compact.get(pos + 1).map_or(Some(0), |&d| hex_digit(d));
        if let (Some(h), Some(l)) = (high, low) {
            let byte: u8 = (h << 4) | l;
            // Keep printable ASCII only; drop binary noise.
            if byte.is_ascii_graphic() || byte == b' ' {
                decoded.push(char::from(byte));
            }
            pos += 2;
        } else {
            // Skip a malformed digit and resync.
            pos += 1;
        }
    }
    decoded
}
/// Value of one hex digit (0-9, a-f, A-F), or `None` for any other byte.
fn hex_digit(b: u8) -> Option<u8> {
    // char::to_digit with radix 16 accepts exactly the same digit set.
    (b as char).to_digit(16).map(|d| d as u8)
}
/// Parse a TJ-style array `[...]`, collecting every literal and hex string
/// inside it. Returns the strings and total bytes consumed (including both
/// brackets), or `None` when the array is never closed.
fn parse_array_strings(bytes: &[u8]) -> Option<(Vec<String>, usize)> {
    if bytes.first() != Some(&b'[') {
        return None;
    }
    let mut collected = Vec::new();
    let mut pos = 1;
    while pos < bytes.len() {
        let b = bytes[pos];
        if b == b']' {
            return Some((collected, pos + 1));
        }
        // Try to parse a string element at this position; anything else
        // (numbers, whitespace, `<<`) is skipped byte by byte.
        let parsed = if b == b'(' {
            parse_literal_string(&bytes[pos..])
        } else if b == b'<' && bytes.get(pos + 1).is_some_and(|&n| n != b'<') {
            parse_hex_string(&bytes[pos..])
        } else {
            None
        };
        match parsed {
            Some((s, consumed)) => {
                collected.push(s);
                pos += consumed;
            }
            None => pos += 1,
        }
    }
    // Ran off the end without a closing ']'.
    None
}
/// Index one past the end of the operator token starting at `i`: the first
/// PDF delimiter/whitespace byte at or after `i`, or the end of input.
fn scan_operator_end(bytes: &[u8], i: usize) -> usize {
    bytes[i..]
        .iter()
        .position(|&b| is_pdf_delimiter_or_ws(b))
        .map_or(bytes.len(), |offset| i + offset)
}
/// Replace every control character except `\n` with a space, preserving all
/// other characters unchanged.
fn sanitize_pdf_string(s: &str) -> String {
    let mut cleaned = String::with_capacity(s.len());
    for ch in s.chars() {
        // Newlines survive; every other control character becomes a space.
        if ch != '\n' && ch.is_control() {
            cleaned.push(' ');
        } else {
            cleaned.push(ch);
        }
    }
    cleaned
}
/// Render the final markdown: a bracketed status header, followed by the
/// extracted text when any was found, or a scanned-PDF notice otherwise.
fn build_markdown(text: Option<String>, page_count: usize) -> String {
    // Compute the label once; both match arms previously duplicated this.
    let pages_label = if page_count == 1 {
        "1 page".to_string()
    } else {
        format!("{page_count} pages")
    };
    match text {
        // Whitespace-only extractions fall through to the scanned notice.
        Some(extracted) if !extracted.trim().is_empty() => format!(
            "[PDF: {pages_label}, text extracted — for full fidelity rebuild with `--features pdf`]\n\n{}",
            extracted.trim()
        ),
        _ => format!(
            "[PDF: {pages_label}, scanned — no text layer detected. \
             Use OCR or rebuild with `--features pdf` for pdfium extraction.]"
        ),
    }
}
// Unit tests for the light PDF handler. Grouped by the helper under test;
// fixtures are small hand-built PDF fragments, not real files.
#[cfg(test)]
mod tests {
use super::*;
// --- header sniffing (is_pdf) ---
#[test]
fn is_pdf_returns_true_for_valid_header() {
assert!(is_pdf(b"%PDF-1.4\n"));
}
#[test]
fn is_pdf_returns_false_for_non_pdf_bytes() {
assert!(!is_pdf(b"<!DOCTYPE html>"));
}
#[test]
fn is_pdf_returns_false_for_empty_bytes() {
assert!(!is_pdf(b""));
}
// --- page counting over raw /Type /Page markers ---
#[test]
fn count_pages_returns_1_for_single_page_pdf_fragment() {
let bytes = b"%PDF-1.4\n/Type /Page\n";
let count = count_pages(bytes);
assert_eq!(count, 1);
}
#[test]
fn count_pages_returns_3_for_three_page_entries() {
let bytes = b"%PDF-1.4\n/Type /Page\n/Type /Page\n/Type /Page\n";
let count = count_pages(bytes);
assert_eq!(count, 3);
}
#[test]
fn count_pages_handles_compact_nospace_variant() {
let bytes = b"%PDF-1.4\n/Type/Page\n/Type/Page\n";
let count = count_pages(bytes);
assert_eq!(count, 2);
}
// --- literal string parsing: (text) with escapes and nesting ---
#[test]
fn parse_literal_string_decodes_simple_string() {
let (s, consumed) = parse_literal_string(b"(Hello, World!)").unwrap();
assert_eq!(s, "Hello, World!");
assert_eq!(consumed, 15);
}
#[test]
fn parse_literal_string_handles_escaped_parens() {
let (s, consumed) = parse_literal_string(b"(foo\\(bar\\)baz)").unwrap();
assert_eq!(s, "foo(bar)baz");
assert_eq!(consumed, 15);
}
#[test]
fn parse_literal_string_handles_nested_parens() {
let (s, _) = parse_literal_string(b"(outer (inner) end)").unwrap();
assert_eq!(s, "outer (inner) end");
}
#[test]
fn parse_literal_string_returns_none_for_unclosed() {
let result = parse_literal_string(b"(unclosed");
assert!(result.is_none());
}
// --- hex string parsing: <48 65 ...> ---
#[test]
fn parse_hex_string_decodes_ascii_hex_pairs() {
let (s, consumed) = parse_hex_string(b"<4869>").unwrap();
assert_eq!(s, "Hi");
assert_eq!(consumed, 6);
}
#[test]
fn parse_hex_string_ignores_spaces_in_hex_content() {
let (s, _) = parse_hex_string(b"<48 65 6C 6C 6F>").unwrap();
assert_eq!(s, "Hello");
}
#[test]
fn parse_hex_string_returns_none_for_unclosed() {
let result = parse_hex_string(b"<4869");
assert!(result.is_none());
}
// --- end-to-end text scraping from BT/ET blocks ---
#[test]
fn extract_pdf_text_finds_text_in_bt_et_block() {
let pdf = b"%PDF-1.4\nBT\n(Hello PDF) Tj\nET\n";
let text = extract_pdf_text(pdf);
assert!(text.is_some());
let t = text.unwrap();
assert!(t.contains("Hello PDF"), "got: {t}");
}
#[test]
fn extract_pdf_text_returns_none_for_no_bt_blocks() {
let pdf = b"%PDF-1.4\nxref\n0 1\n0000000000 65535 f \n";
let text = extract_pdf_text(pdf);
assert!(text.is_none());
}
#[test]
fn extract_pdf_text_handles_multiple_bt_et_blocks() {
let pdf = b"%PDF-1.4\nBT\n(First line) Tj\nET\nBT\n(Second line) Tj\nET\n";
let text = extract_pdf_text(pdf);
let t = text.expect("expected text");
assert!(t.contains("First line"), "got: {t}");
assert!(t.contains("Second line"), "got: {t}");
}
// --- markdown rendering ---
#[test]
fn build_markdown_with_text_includes_extraction_note() {
let md = build_markdown(Some("Sample content".to_string()), 2);
assert!(md.contains("[PDF: 2 pages"), "got: {md}");
assert!(md.contains("Sample content"), "got: {md}");
}
#[test]
fn build_markdown_with_no_text_reports_scanned_pdf() {
let md = build_markdown(None, 5);
assert!(md.contains("[PDF:"), "got: {md}");
assert!(md.contains("scanned"), "got: {md}");
assert!(md.contains("5 pages"), "got: {md}");
}
#[test]
fn build_markdown_single_page_uses_singular_form() {
let md = build_markdown(None, 1);
assert!(md.contains("1 page"), "got: {md}");
assert!(!md.contains("1 pages"), "got: {md}");
}
// --- handler-level behavior (ContentHandler impl) ---
#[test]
fn pdf_light_handler_returns_error_for_non_pdf_bytes() {
let handler = PdfLightHandler;
let result = handler.to_markdown(b"<html>not a pdf</html>", "application/pdf");
assert!(result.is_err());
}
#[test]
fn pdf_light_handler_extracts_text_from_simple_pdf() {
let pdf = b"%PDF-1.4\n/Type /Page\nBT\n(Test document) Tj\nET\n%%EOF";
let handler = PdfLightHandler;
let result = handler.to_markdown(pdf, "application/pdf").unwrap();
assert!(
result.markdown.contains("Test document"),
"got: {}",
result.markdown
);
assert_eq!(result.page_count, Some(1));
}
#[test]
fn pdf_light_handler_reports_scanned_for_no_text_layer() {
let pdf = b"%PDF-1.4\n/Type /Page\n/Type /Page\nxref\n%%EOF";
let handler = PdfLightHandler;
let result = handler.to_markdown(pdf, "application/pdf").unwrap();
assert!(
result.markdown.contains("scanned"),
"got: {}",
result.markdown
);
}
#[test]
fn pdf_light_supported_types_is_application_pdf() {
let handler = PdfLightHandler;
assert_eq!(handler.supported_types(), &["application/pdf"]);
}
// --- substring counting primitive ---
#[test]
fn count_occurrences_finds_no_match_in_empty_haystack() {
assert_eq!(count_occurrences(b"", b"needle"), 0);
}
#[test]
fn count_occurrences_finds_multiple_non_overlapping() {
let hay = b"abcabcabc";
assert_eq!(count_occurrences(hay, b"abc"), 3);
}
}