use std::collections::HashMap;
use crate::ast::{DocumentMetadata, Warning};
/// Properties of one font resource as resolved by `resolve_fonts_for_page`.
#[derive(Debug, Clone, PartialEq)]
pub struct FontInfo {
/// The font's `BaseFont` name, with a six-uppercase-letter subset prefix
/// (`ABCDEF+`) removed when present.
pub name: String,
/// Font size; `resolve_fonts_for_page` always leaves this as `None`.
pub size: Option<f32>,
/// Numeric `FontWeight` entry of the font descriptor, if one was found.
pub font_weight: Option<f32>,
/// Numeric `ItalicAngle` entry of the font descriptor, if one was found.
pub italic_angle: Option<f32>,
}
/// One piece of text produced by a single `Tj` or `TJ` operator, together with
/// the font state that was in effect when it was shown.
#[derive(Debug, Clone, PartialEq)]
pub struct RawTextSegment {
/// Decoded text of the operator's string operand(s).
pub text: String,
/// Name operand of the governing `Tf` operator, or `b"<unknown>"` when no
/// usable font name was available.
pub font_resource_name: Vec<u8>,
/// Size operand of the governing `Tf` operator, or `0.0` when unavailable.
pub font_size: f32,
/// Page number that was passed to the extractor for this segment.
pub page_number: usize,
}
/// Parses an in-memory PDF.
///
/// Returns `(Some(document), [])` on success. Empty input and parser failures
/// are both reported as `(None, [Warning::MalformedPdfObject { .. }])` instead
/// of an error value, so callers can keep accumulating warnings.
pub fn load_pdf(bytes: &[u8]) -> (Option<lopdf::Document>, Vec<Warning>) {
    // Both failure paths share the same shape: no document, one warning.
    let failure = |detail: String| -> (Option<lopdf::Document>, Vec<Warning>) {
        (None, vec![Warning::MalformedPdfObject { detail }])
    };

    if bytes.is_empty() {
        return failure("empty PDF bytes".to_string());
    }

    match lopdf::Document::load_mem(bytes) {
        Ok(document) => (Some(document), Vec::new()),
        Err(err) => failure(format!("failed to load PDF: {}", err)),
    }
}
/// Reads `FontWeight` and `ItalicAngle` from a font's `FontDescriptor`.
///
/// The descriptor may be stored inline or behind an indirect reference; a
/// reference is resolved through `doc`. Returns `(None, None)` when the
/// descriptor is missing, cannot be resolved, or is not a dictionary. Each
/// metric is individually `None` when absent or non-numeric.
fn extract_font_descriptor_metrics(
    doc: &lopdf::Document,
    font_dict: &lopdf::Dictionary,
) -> (Option<f32>, Option<f32>) {
    let descriptor_obj = match font_dict.get(b"FontDescriptor") {
        Ok(lopdf::Object::Reference(id)) => doc.get_object(*id).ok(),
        Ok(direct) => Some(direct),
        Err(_) => None,
    };

    match descriptor_obj.and_then(|obj| obj.as_dict().ok()) {
        Some(desc) => {
            let metric = |key: &[u8]| desc.get(key).ok().and_then(extract_number);
            let font_weight = metric(b"FontWeight");
            let italic_angle = metric(b"ItalicAngle");
            (font_weight, italic_angle)
        }
        None => (None, None),
    }
}
/// Removes a font-subset tag of the form `ABCDEF+` from the front of `name`.
///
/// The tag is stripped only when it is exactly six ASCII uppercase letters
/// immediately followed by `+`; any other input is returned unchanged.
pub(crate) fn strip_subset_prefix(name: &str) -> &str {
    // Uppercase letters are never '+', so a valid tag always ends at the
    // first '+' in the string.
    let Some((tag, remainder)) = name.split_once('+') else {
        return name;
    };
    let is_subset_tag = tag.len() == 6 && tag.bytes().all(|b| b.is_ascii_uppercase());
    if is_subset_tag {
        remainder
    } else {
        name
    }
}
/// Builds a map from font resource name to [`FontInfo`] for one page.
///
/// `page_number` is looked up as a key in `doc.get_pages()`. Problems are
/// reported as warnings rather than errors:
///
/// * a page number that does not fit in `u32`, an unknown page, or unreadable
///   font resources yield an empty map plus one `MalformedPdfObject` warning;
/// * a font without a usable `BaseFont` name is skipped with a
///   `MissingFontMetrics` warning.
///
/// `FontInfo::size` is always `None` here.
pub fn resolve_fonts_for_page(
    doc: &lopdf::Document,
    page_number: usize,
) -> (HashMap<Vec<u8>, FontInfo>, Vec<Warning>) {
    let mut resolved = HashMap::new();
    let mut warnings = Vec::new();
    let pages = doc.get_pages();

    let Ok(page_key) = u32::try_from(page_number) else {
        warnings.push(Warning::MalformedPdfObject {
            detail: format!("page number {} exceeds u32 range", page_number),
        });
        return (resolved, warnings);
    };

    let Some(&page_id) = pages.get(&page_key) else {
        warnings.push(Warning::MalformedPdfObject {
            detail: format!(
                "page {} not found (document has {} pages)",
                page_number,
                pages.len()
            ),
        });
        return (resolved, warnings);
    };

    let page_fonts = match doc.get_page_fonts(page_id) {
        Ok(found) => found,
        Err(err) => {
            warnings.push(Warning::MalformedPdfObject {
                detail: format!(
                    "failed to read font resources for page {}: {}",
                    page_number, err
                ),
            });
            return (resolved, warnings);
        }
    };

    for (resource_name, font_dict) in page_fonts {
        // A missing entry and a non-name entry are reported identically.
        let base_font = font_dict
            .get(b"BaseFont")
            .ok()
            .and_then(|obj| obj.as_name().ok());
        let Some(name_bytes) = base_font else {
            warnings.push(Warning::MissingFontMetrics {
                font_name: "<unknown>".to_string(),
                page: page_number,
            });
            continue;
        };

        let name = strip_subset_prefix(&String::from_utf8_lossy(name_bytes)).to_string();
        let (font_weight, italic_angle) = extract_font_descriptor_metrics(doc, &font_dict);
        resolved.insert(
            resource_name,
            FontInfo {
                name,
                size: None,
                font_weight,
                italic_angle,
            },
        );
    }

    (resolved, warnings)
}
/// Characters for single-byte codes `0x80..=0x9F`, indexed by `code - 0x80`.
///
/// Entries holding `U+FFFD` are codes this table does not assign a character to.
const WINANSI_0X80_TO_0X9F: [char; 32] = [
    // 0x80..=0x87
    '\u{20AC}', '\u{FFFD}', '\u{201A}', '\u{0192}', '\u{201E}', '\u{2026}', '\u{2020}', '\u{2021}',
    // 0x88..=0x8F
    '\u{02C6}', '\u{2030}', '\u{0160}', '\u{2039}', '\u{0152}', '\u{FFFD}', '\u{017D}', '\u{FFFD}',
    // 0x90..=0x97
    '\u{FFFD}', '\u{2018}', '\u{2019}', '\u{201C}', '\u{201D}', '\u{2022}', '\u{2013}', '\u{2014}',
    // 0x98..=0x9F
    '\u{02DC}', '\u{2122}', '\u{0161}', '\u{203A}', '\u{0153}', '\u{FFFD}', '\u{017E}', '\u{0178}',
];

/// Maps one single-byte code to a `char`.
///
/// Codes in `0x80..=0x9F` go through [`WINANSI_0X80_TO_0X9F`]; every other code
/// maps to the Unicode scalar with the same numeric value.
fn winansi_byte_to_char(b: u8) -> char {
    match b {
        0x80..=0x9F => WINANSI_0X80_TO_0X9F[usize::from(b - 0x80)],
        _ => char::from(b),
    }
}
fn decode_pdf_string(bytes: &[u8]) -> String {
if bytes.len() >= 2 && bytes[0] == 0xFE && bytes[1] == 0xFF {
return decode_utf16be(&bytes[2..]);
}
if bytes.len() >= 2 && bytes.len().is_multiple_of(2) && bytes[0] == 0x00 {
return decode_utf16be(bytes);
}
bytes.iter().map(|&b| winansi_byte_to_char(b)).collect()
}
/// Lossily decodes big-endian UTF-16 bytes into a `String`.
///
/// Malformed input never fails; it is replaced with `U+FFFD`:
/// * each unpaired surrogate becomes one replacement character;
/// * a dangling final byte (odd-length input) becomes one replacement
///   character, instead of being dropped without a trace.
fn decode_utf16be(bytes: &[u8]) -> String {
    let pairs = bytes.chunks_exact(2);
    // `chunks_exact` leaves an odd trailing byte in `remainder()`; record it
    // before the iterator is consumed so the truncation stays visible.
    let has_dangling_byte = !pairs.remainder().is_empty();

    let code_units = pairs.map(|pair| u16::from_be_bytes([pair[0], pair[1]]));
    let mut decoded: String = char::decode_utf16(code_units)
        .map(|unit| unit.unwrap_or(char::REPLACEMENT_CHARACTER))
        .collect();

    if has_dangling_byte {
        decoded.push(char::REPLACEMENT_CHARACTER);
    }
    decoded
}
/// Collects the text shown on one page, in content-stream operator order.
///
/// The page's decoded content stream is scanned for these operators:
///
/// * `BT` / `ET` reset the tracked font name, font size and warning flag;
/// * `Tf` records the font resource name and size (needs two operands);
/// * `Tj` emits one segment for its string operand;
/// * `TJ` emits one segment for the whole array, inserting a single space
///   where a numeric adjustment below `-100.0` follows a string and neither
///   neighbour already supplies whitespace.
///
/// Segments with empty text are not emitted. Text shown before any `Tf` in the
/// current text object gets font name `b"<unknown>"` and size `0.0`, with one
/// `MalformedPdfObject` warning per text object.
///
/// A `page_number` that does not fit in `u32` or is not in `doc.get_pages()`
/// returns empty results with no warning. A content stream that cannot be
/// decoded returns an `UnreadableTextStream` warning.
///
/// `_fonts` is accepted for interface compatibility and is currently unused.
pub fn extract_text_segments_for_page(
    doc: &lopdf::Document,
    page_number: usize,
    _fonts: &HashMap<Vec<u8>, FontInfo>,
) -> (Vec<RawTextSegment>, Vec<Warning>) {
    let mut segments = Vec::new();
    let mut warnings = Vec::new();
    let pages = doc.get_pages();
    let page_num_u32 = match u32::try_from(page_number) {
        Ok(n) => n,
        Err(_) => return (segments, warnings),
    };
    let page_id = match pages.get(&page_num_u32) {
        Some(id) => *id,
        None => return (segments, warnings),
    };
    let content = match doc.get_and_decode_page_content(page_id) {
        Ok(c) => c,
        Err(e) => {
            warnings.push(Warning::UnreadableTextStream {
                page: page_number,
                detail: format!("failed to decode content stream: {}", e),
            });
            return (segments, warnings);
        }
    };

    let mut current_font_resource: Option<Vec<u8>> = None;
    let mut current_font_size: Option<f32> = None;
    let mut tf_set_in_text_object = false;
    let mut warned_no_tf = false;

    for op in &content.operations {
        match op.operator.as_str() {
            // NOTE(review): the PDF specification describes text state (the
            // font set by `Tf`) as persisting across text objects; this code
            // scopes it to a single BT..ET pair. Confirm this is intentional.
            "BT" | "ET" => {
                current_font_resource = None;
                current_font_size = None;
                tf_set_in_text_object = false;
                warned_no_tf = false;
            }
            "Tf" => {
                if op.operands.len() >= 2 {
                    if let Some(name_bytes) = extract_name(&op.operands[0]) {
                        current_font_resource = Some(name_bytes);
                    }
                    if let Some(size) = extract_number(&op.operands[1]) {
                        current_font_size = Some(size);
                    }
                    // Set even when an operand had an unexpected type, so the
                    // "text state not set" warning is not raised for this object.
                    tf_set_in_text_object = true;
                }
            }
            "Tj" => {
                if let Some(text_bytes) = op.operands.first().and_then(extract_string_bytes) {
                    let text = decode_pdf_string(&text_bytes);
                    if !text.is_empty() {
                        let (font_res, font_sz) = get_text_state_or_default(
                            &current_font_resource,
                            current_font_size,
                            tf_set_in_text_object,
                            &mut warned_no_tf,
                            page_number,
                            &mut warnings,
                        );
                        segments.push(RawTextSegment {
                            text,
                            font_resource_name: font_res,
                            font_size: font_sz,
                            page_number,
                        });
                    }
                }
            }
            "TJ" => {
                if let Some(lopdf::Object::Array(arr)) = op.operands.first() {
                    let mut combined = String::new();
                    let mut prev_was_string = false;
                    let mut needs_space = false;
                    for item in arr {
                        if let Some(bytes) = extract_string_bytes(item) {
                            let text = decode_pdf_string(&bytes);
                            // Insert a gap space only when neither side
                            // already provides whitespace.
                            if needs_space
                                && !combined.is_empty()
                                && !combined.ends_with(char::is_whitespace)
                                && !text.starts_with(char::is_whitespace)
                            {
                                combined.push(' ');
                            }
                            combined.push_str(&text);
                            prev_was_string = true;
                            needs_space = false;
                        } else if let Some(num) = extract_number(item) {
                            // Adjustments below -100.0 are treated as a word gap.
                            if prev_was_string && num < -100.0 {
                                needs_space = true;
                            }
                        }
                    }
                    if !combined.is_empty() {
                        let (font_res, font_sz) = get_text_state_or_default(
                            &current_font_resource,
                            current_font_size,
                            tf_set_in_text_object,
                            &mut warned_no_tf,
                            page_number,
                            &mut warnings,
                        );
                        segments.push(RawTextSegment {
                            text: combined,
                            font_resource_name: font_res,
                            font_size: font_sz,
                            page_number,
                        });
                    }
                }
            }
            _ => {}
        }
    }

    (segments, warnings)
}
/// Returns the `(font resource name, font size)` to attach to a text segment.
///
/// When `tf_set` is true the current state is used, substituting
/// `b"<unknown>"` / `0.0` for whichever part is `None`. When it is false the
/// defaults `(b"<unknown>", 0.0)` are returned and a single
/// `MalformedPdfObject` warning is pushed; `warned_no_tf` is then set so later
/// calls do not push it again until the caller clears the flag.
fn get_text_state_or_default(
    current_font_resource: &Option<Vec<u8>>,
    current_font_size: Option<f32>,
    tf_set: bool,
    warned_no_tf: &mut bool,
    page_number: usize,
    warnings: &mut Vec<Warning>,
) -> (Vec<u8>, f32) {
    let unknown_font = || b"<unknown>".to_vec();

    if !tf_set {
        if !*warned_no_tf {
            warnings.push(Warning::MalformedPdfObject {
                detail: format!("text state not set before Tj/TJ on page {}", page_number),
            });
            *warned_no_tf = true;
        }
        return (unknown_font(), 0.0);
    }

    let resource = match current_font_resource {
        Some(name) => name.clone(),
        None => unknown_font(),
    };
    (resource, current_font_size.unwrap_or(0.0))
}
/// Returns a copy of the bytes of a `Name` object, or `None` for any other
/// object kind.
fn extract_name(obj: &lopdf::Object) -> Option<Vec<u8>> {
    if let lopdf::Object::Name(raw) = obj {
        Some(raw.clone())
    } else {
        None
    }
}
/// Reads a numeric object as `f32`.
///
/// `Real` values are returned as-is and `Integer` values are converted with
/// `as f32`; every other object kind yields `None`.
fn extract_number(obj: &lopdf::Object) -> Option<f32> {
    if let lopdf::Object::Real(value) = obj {
        return Some(*value);
    }
    if let lopdf::Object::Integer(value) = obj {
        return Some(*value as f32);
    }
    None
}
/// Returns a copy of the raw bytes of a `String` object (its format flag is
/// ignored), or `None` for any other object kind.
fn extract_string_bytes(obj: &lopdf::Object) -> Option<Vec<u8>> {
    if let lopdf::Object::String(raw, _) = obj {
        Some(raw.clone())
    } else {
        None
    }
}
/// Parses a whole PDF into text segments, document metadata and warnings.
///
/// Pages are processed in ascending page-number order. For each page the font
/// warnings are appended before the text-extraction warnings, after any
/// warnings from loading. If the document cannot be loaded, the result has no
/// segments, metadata with `page_count == 0` and no title/author, and the
/// load warnings.
pub fn parse_pdf(bytes: &[u8]) -> (Vec<RawTextSegment>, DocumentMetadata, Vec<Warning>) {
    let (loaded, mut all_warnings) = load_pdf(bytes);
    let mut all_segments = Vec::new();

    let Some(doc) = loaded else {
        let empty_metadata = DocumentMetadata {
            title: None,
            author: None,
            page_count: 0,
        };
        return (all_segments, empty_metadata, all_warnings);
    };

    let pages = doc.get_pages();
    let (title, author) = extract_doc_info(&doc);
    let metadata = DocumentMetadata {
        title,
        author,
        page_count: pages.len(),
    };

    let mut ordered_pages: Vec<u32> = pages.keys().copied().collect();
    ordered_pages.sort_unstable();

    for page_number in ordered_pages.into_iter().map(|n| n as usize) {
        let (fonts, font_warnings) = resolve_fonts_for_page(&doc, page_number);
        all_warnings.extend(font_warnings);

        let (page_segments, extract_warnings) =
            extract_text_segments_for_page(&doc, page_number, &fonts);
        all_warnings.extend(extract_warnings);
        all_segments.extend(page_segments);
    }

    (all_segments, metadata, all_warnings)
}
/// Crate-visible wrapper around the private `extract_doc_info`.
///
/// Returns that function's `(title, author)` pair unchanged.
pub(crate) fn extract_doc_info_pub(doc: &lopdf::Document) -> (Option<String>, Option<String>) {
extract_doc_info(doc)
}
/// Reads the `Title` and `Author` strings from the trailer's `Info` dictionary.
///
/// `Info` may be stored inline or behind an indirect reference. Returns
/// `(None, None)` when the entry is missing, cannot be resolved, or is not a
/// dictionary; each field is individually `None` when absent, not a string,
/// or empty after decoding.
fn extract_doc_info(doc: &lopdf::Document) -> (Option<String>, Option<String>) {
    let Ok(entry) = doc.trailer.get(b"Info") else {
        return (None, None);
    };

    let target = match entry {
        lopdf::Object::Reference(id) => match doc.get_object(*id) {
            Ok(resolved) => resolved,
            Err(_) => return (None, None),
        },
        direct => direct,
    };

    let Ok(info) = target.as_dict() else {
        return (None, None);
    };

    let title = get_info_string(info, b"Title");
    let author = get_info_string(info, b"Author");
    (title, author)
}
/// Looks up `key` in `info` and decodes it as text.
///
/// Returns `None` when the key is missing, the value is not a string object,
/// or the decoded text is empty.
fn get_info_string(info: &lopdf::Dictionary, key: &[u8]) -> Option<String> {
    let Ok(lopdf::Object::String(raw, _)) = info.get(key) else {
        return None;
    };
    Some(decode_pdf_string(raw)).filter(|text| !text.is_empty())
}
// Unit tests for this module. Fixture-based tests read PDF files from
// `tests/fixtures` under the parent directory of `CARGO_MANIFEST_DIR`.
#[cfg(test)]
mod tests {
use super::*;
use std::path::PathBuf;
/// Builds the path of the fixture file `name`:
/// `<parent of CARGO_MANIFEST_DIR>/tests/fixtures/<name>`.
fn fixture_path(name: &str) -> PathBuf {
let manifest_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
manifest_dir
.parent()
.expect("workspace root")
.join("tests")
.join("fixtures")
.join(name)
}
// ---- load_pdf ----
#[test]
fn load_pdf_empty_bytes_returns_none_with_warning() {
let (doc, warnings) = load_pdf(b"");
assert!(doc.is_none(), "empty bytes should not produce a document");
assert!(!warnings.is_empty(), "should emit at least one warning");
match &warnings[0] {
Warning::MalformedPdfObject { detail } => {
assert!(!detail.is_empty(), "detail should be non-empty");
}
other => panic!("expected MalformedPdfObject, got {:?}", other),
}
}
#[test]
fn load_pdf_invalid_header_returns_none_with_warning() {
let (doc, warnings) = load_pdf(b"this is not a PDF");
assert!(
doc.is_none(),
"invalid header should not produce a document"
);
assert!(!warnings.is_empty(), "should emit at least one warning");
match &warnings[0] {
Warning::MalformedPdfObject { detail } => {
assert!(!detail.is_empty(), "detail should be non-empty");
}
other => panic!("expected MalformedPdfObject, got {:?}", other),
}
}
#[test]
fn load_pdf_corrupted_fixture_returns_none_with_warning() {
let path = fixture_path("corrupted.pdf");
let bytes = std::fs::read(&path)
.unwrap_or_else(|e| panic!("corrupted.pdf fixture must exist at {:?}: {}", path, e));
let (doc, warnings) = load_pdf(&bytes);
assert!(doc.is_none(), "corrupted PDF should not produce a document");
assert!(!warnings.is_empty(), "should emit at least one warning");
match &warnings[0] {
Warning::MalformedPdfObject { detail } => {
assert!(!detail.is_empty(), "detail should be non-empty");
}
other => panic!("expected MalformedPdfObject, got {:?}", other),
}
}
#[test]
fn load_pdf_valid_simple_fixture_returns_some() {
let path = fixture_path("simple.pdf");
let bytes = std::fs::read(&path)
.unwrap_or_else(|e| panic!("simple.pdf fixture must exist at {:?}: {}", path, e));
let (doc, warnings) = load_pdf(&bytes);
assert!(doc.is_some(), "valid PDF should produce a document");
for w in &warnings {
if let Warning::MalformedPdfObject { .. } = w {
panic!("valid PDF should not produce MalformedPdfObject warning");
}
}
}
/// Reads the fixture `name` and loads it with `load_pdf`, panicking if the
/// file cannot be read or no document is produced. Load warnings are discarded.
fn load_fixture(name: &str) -> lopdf::Document {
let path = fixture_path(name);
let bytes = std::fs::read(&path)
.unwrap_or_else(|e| panic!("fixture {} must exist at {:?}: {}", name, path, e));
let (doc, _) = load_pdf(&bytes);
doc.expect("fixture should be a valid PDF")
}
// ---- resolve_fonts_for_page ----
#[test]
fn resolve_fonts_for_page_simple_returns_font_entries() {
let doc = load_fixture("simple.pdf");
let (fonts, warnings) = resolve_fonts_for_page(&doc, 1);
assert!(
!fonts.is_empty(),
"simple.pdf page 1 should have font entries"
);
let has_helvetica = fonts.values().any(|f| f.name.contains("Helvetica"));
assert!(
has_helvetica,
"simple.pdf should have a Helvetica font, got: {:?}",
fonts
);
// Despite its name, this collects `MissingFontMetrics` warnings.
let malformed_warnings: Vec<_> = warnings
.iter()
.filter(|w| matches!(w, Warning::MissingFontMetrics { .. }))
.collect();
assert!(
malformed_warnings.is_empty(),
"well-formed page should not produce MissingFontMetrics warnings"
);
}
#[test]
fn resolve_fonts_for_page_bold_italic_returns_bold_and_italic_fonts() {
let doc = load_fixture("bold-italic.pdf");
let (fonts, _) = resolve_fonts_for_page(&doc, 1);
assert!(
!fonts.is_empty(),
"bold-italic.pdf page 1 should have font entries"
);
let names: Vec<&str> = fonts.values().map(|f| f.name.as_str()).collect();
let has_bold = names.iter().any(|n| n.to_lowercase().contains("bold"));
let has_italic = names
.iter()
.any(|n| n.to_lowercase().contains("oblique") || n.to_lowercase().contains("italic"));
assert!(
has_bold,
"bold-italic.pdf should have a Bold font, got: {:?}",
names
);
assert!(
has_italic,
"bold-italic.pdf should have an Oblique/Italic font, got: {:?}",
names
);
}
#[test]
fn resolve_fonts_for_page_preserves_resource_names_as_keys() {
let doc = load_fixture("simple.pdf");
let (fonts, _) = resolve_fonts_for_page(&doc, 1);
for key in fonts.keys() {
assert!(
!key.is_empty(),
"font resource name key should not be empty"
);
}
}
// ---- extract_text_segments_for_page ----
#[test]
fn extract_text_segments_for_page_simple_returns_segments() {
let doc = load_fixture("simple.pdf");
let (fonts, _) = resolve_fonts_for_page(&doc, 1);
let (segments, warnings) = extract_text_segments_for_page(&doc, 1, &fonts);
assert!(
!segments.is_empty(),
"simple.pdf page 1 should produce text segments"
);
for seg in &segments {
assert_eq!(seg.page_number, 1, "all segments should be page 1");
}
let combined: String = segments.iter().map(|s| s.text.as_str()).collect();
assert!(
combined.contains("Chapter 1"),
"simple.pdf should contain 'Chapter 1', got: {:?}",
combined
);
for w in &warnings {
if let Warning::UnreadableTextStream { .. } = w {
panic!("well-formed page should not produce UnreadableTextStream");
}
}
}
#[test]
fn extract_text_segments_for_page_bold_italic_returns_different_fonts() {
let doc = load_fixture("bold-italic.pdf");
let (fonts, _) = resolve_fonts_for_page(&doc, 1);
let (segments, _) = extract_text_segments_for_page(&doc, 1, &fonts);
assert!(
!segments.is_empty(),
"bold-italic.pdf page 1 should produce text segments"
);
let unique_fonts: std::collections::HashSet<&Vec<u8>> =
segments.iter().map(|s| &s.font_resource_name).collect();
assert!(
unique_fonts.len() >= 2,
"bold-italic.pdf should use at least 2 different fonts, got: {:?}",
unique_fonts
);
}
#[test]
fn extract_text_segments_for_page_font_size_comes_from_tf_state() {
let doc = load_fixture("simple.pdf");
let (fonts, _) = resolve_fonts_for_page(&doc, 1);
let (segments, _) = extract_text_segments_for_page(&doc, 1, &fonts);
for seg in &segments {
assert!(
seg.font_size > 0.0,
"font_size should be positive (from Tf), got: {}",
seg.font_size
);
}
}
#[test]
fn extract_text_segments_for_page_preserves_operator_encounter_order() {
let doc = load_fixture("simple.pdf");
let (fonts, _) = resolve_fonts_for_page(&doc, 1);
let (segments, _) = extract_text_segments_for_page(&doc, 1, &fonts);
let combined: String = segments.iter().map(|s| s.text.as_str()).collect();
// The ordering assertion only runs when both strings are present; if
// either is missing this test passes without checking anything.
if combined.contains("Chapter 1") && combined.contains("Body text.") {
let chapter_pos = combined.find("Chapter 1").unwrap();
let body_pos = combined.find("Body text.").unwrap();
assert!(
chapter_pos < body_pos,
"Chapter 1 should come before Body text. in operator order"
);
}
}
// ---- get_text_state_or_default ----
#[test]
fn get_text_state_or_default_with_tf_set_returns_current_state() {
let font_res = Some(b"F1".to_vec());
let mut warned = false;
let mut warnings = Vec::new();
let (res, size) =
get_text_state_or_default(&font_res, Some(12.0), true, &mut warned, 1, &mut warnings);
assert_eq!(res, b"F1");
assert_eq!(size, 12.0);
assert!(
warnings.is_empty(),
"should not emit warning when Tf is set"
);
assert!(!warned, "warned flag should remain false");
}
#[test]
fn get_text_state_or_default_without_tf_returns_defaults_and_warns_once() {
let mut warned = false;
let mut warnings = Vec::new();
let (res1, size1) =
get_text_state_or_default(&None, None, false, &mut warned, 1, &mut warnings);
assert_eq!(res1, b"<unknown>");
assert_eq!(size1, 0.0);
assert_eq!(warnings.len(), 1, "should emit exactly one warning");
match &warnings[0] {
Warning::MalformedPdfObject { detail } => {
assert!(detail.contains("text state not set before Tj/TJ"));
}
other => panic!("expected MalformedPdfObject, got {:?}", other),
}
assert!(warned, "warned flag should be set after first call");
let (res2, size2) =
get_text_state_or_default(&None, None, false, &mut warned, 1, &mut warnings);
assert_eq!(res2, b"<unknown>");
assert_eq!(size2, 0.0);
assert_eq!(
warnings.len(),
1,
"should still have exactly one warning after second call"
);
}
#[test]
fn get_text_state_or_default_warned_resets_across_text_objects() {
let mut warned = false;
let mut warnings = Vec::new();
get_text_state_or_default(&None, None, false, &mut warned, 1, &mut warnings);
assert_eq!(warnings.len(), 1);
// Clearing the flag by hand stands in for the reset the extractor
// performs on `BT`/`ET`.
warned = false;
get_text_state_or_default(&None, None, false, &mut warned, 1, &mut warnings);
assert_eq!(
warnings.len(),
2,
"should have two warnings for two separate text objects"
);
}
// ---- parse_pdf ----
#[test]
fn parse_pdf_simple_returns_metadata_and_segments() {
let path = fixture_path("simple.pdf");
let bytes = std::fs::read(&path).unwrap();
let (segments, metadata, warnings) = parse_pdf(&bytes);
assert_eq!(metadata.page_count, 1, "simple.pdf has 1 page");
assert!(!segments.is_empty(), "should produce segments");
for seg in &segments {
assert_eq!(seg.page_number, 1, "all segments should be page 1");
}
let combined: String = segments.iter().map(|s| s.text.as_str()).collect();
assert!(combined.contains("Chapter 1"), "should contain 'Chapter 1'");
assert!(
combined.contains("Body text."),
"should contain 'Body text.'"
);
for w in &warnings {
if let Warning::UnreadableTextStream { .. } = w {
panic!("valid PDF should not produce UnreadableTextStream");
}
}
}
#[test]
fn parse_pdf_failed_load_returns_empty_with_warning() {
let (segments, metadata, warnings) = parse_pdf(b"not a pdf");
assert!(
segments.is_empty(),
"failed load should produce no segments"
);
assert_eq!(
metadata.page_count, 0,
"failed load should have page_count=0"
);
assert!(metadata.title.is_none(), "failed load should have no title");
assert!(
metadata.author.is_none(),
"failed load should have no author"
);
assert!(!warnings.is_empty(), "failed load should produce warnings");
match &warnings[0] {
Warning::MalformedPdfObject { detail } => {
assert!(!detail.is_empty());
}
other => panic!("expected MalformedPdfObject, got {:?}", other),
}
}
#[test]
fn parse_pdf_empty_bytes_returns_empty_with_warning() {
let (segments, metadata, warnings) = parse_pdf(b"");
assert!(segments.is_empty());
assert_eq!(metadata.page_count, 0);
assert!(!warnings.is_empty());
match &warnings[0] {
Warning::MalformedPdfObject { .. } => {}
other => panic!("expected MalformedPdfObject, got {:?}", other),
}
}
#[test]
fn parse_pdf_corrupted_fixture_returns_empty_with_warning() {
let path = fixture_path("corrupted.pdf");
let bytes = std::fs::read(&path).unwrap();
let (segments, metadata, warnings) = parse_pdf(&bytes);
assert!(
segments.is_empty(),
"corrupted PDF should produce no segments"
);
assert_eq!(metadata.page_count, 0);
assert!(!warnings.is_empty());
match &warnings[0] {
Warning::MalformedPdfObject { detail } => {
assert!(!detail.is_empty());
}
other => panic!("expected MalformedPdfObject, got {:?}", other),
}
}
#[test]
fn parse_pdf_multi_page_has_1_based_page_numbers() {
let path = fixture_path("multi-page.pdf");
let bytes = std::fs::read(&path).unwrap();
let (segments, metadata, _) = parse_pdf(&bytes);
assert!(
metadata.page_count >= 2,
"multi-page.pdf should have 2+ pages"
);
let min_page = segments.iter().map(|s| s.page_number).min().unwrap_or(0);
assert_eq!(min_page, 1, "minimum page number should be 1 (1-based)");
let max_page = segments.iter().map(|s| s.page_number).max().unwrap_or(0);
assert!(
max_page >= 2,
"multi-page.pdf should have segments from page 2+, got max={}",
max_page
);
}
#[test]
fn parse_pdf_aggregates_warnings_in_stable_order() {
let path = fixture_path("simple.pdf");
let bytes = std::fs::read(&path).unwrap();
// Parsing the same bytes twice must yield identical warning lists.
let (_, _, warnings1) = parse_pdf(&bytes);
let (_, _, warnings2) = parse_pdf(&bytes);
assert_eq!(
warnings1.len(),
warnings2.len(),
"warnings should be deterministic"
);
assert_eq!(
warnings1, warnings2,
"warnings should be stable across runs"
);
}
// ---- decode_pdf_string / decode_utf16be ----
#[test]
fn decode_pdf_string_winansi_ascii() {
let result = decode_pdf_string(b"Hello World");
assert_eq!(result, "Hello World");
}
#[test]
fn decode_pdf_string_winansi_high_latin_range() {
let result = decode_pdf_string(&[0xE9, 0xFC, 0xF1]);
assert_eq!(result, "\u{00E9}\u{00FC}\u{00F1}");
assert!(
!result.contains(char::REPLACEMENT_CHARACTER),
"valid WinAnsi high-latin should not produce replacement chars"
);
}
#[test]
fn decode_pdf_string_winansi_0x80_to_0x9f_range() {
let result = decode_pdf_string(&[0x80, 0x93, 0x96, 0x97]);
assert_eq!(result, "\u{20AC}\u{201C}\u{2013}\u{2014}");
}
#[test]
fn decode_pdf_string_winansi_undefined_bytes_use_replacement() {
let result = decode_pdf_string(&[0x81, 0x8D]);
assert_eq!(result, "\u{FFFD}\u{FFFD}");
}
#[test]
fn decode_pdf_string_utf16be_with_bom() {
let bytes = [0xFE, 0xFF, 0x00, 0x48, 0x00, 0x69];
let result = decode_pdf_string(&bytes);
assert_eq!(result, "Hi");
assert!(
!result.contains(char::REPLACEMENT_CHARACTER),
"valid UTF-16BE should not produce replacement chars"
);
}
#[test]
fn decode_pdf_string_utf16be_without_bom() {
let bytes = [0x00, 0x41, 0x00, 0x42];
let result = decode_pdf_string(&bytes);
assert_eq!(result, "AB");
}
#[test]
fn decode_pdf_string_utf16be_with_non_ascii() {
let bytes = [0xFE, 0xFF, 0x00, 0x63, 0x00, 0x61, 0x00, 0x66, 0x00, 0xE9];
let result = decode_pdf_string(&bytes);
assert_eq!(result, "caf\u{00E9}");
}
#[test]
fn decode_pdf_string_empty_bytes() {
let result = decode_pdf_string(b"");
assert_eq!(result, "");
}
#[test]
fn decode_pdf_string_single_byte_not_utf16() {
let result = decode_pdf_string(&[0x41]);
assert_eq!(result, "A");
}
#[test]
fn decode_utf16be_invalid_surrogate_uses_replacement() {
// 0xD800 is a lone high surrogate with no following low surrogate.
let bytes = [0xD8, 0x00];
let result = decode_utf16be(&bytes);
assert!(
result.contains(char::REPLACEMENT_CHARACTER),
"invalid surrogate should produce replacement char, got: {:?}",
result
);
}
// ---- winansi_byte_to_char ----
#[test]
fn winansi_byte_to_char_ascii_range() {
assert_eq!(winansi_byte_to_char(0x41), 'A');
assert_eq!(winansi_byte_to_char(0x20), ' ');
assert_eq!(winansi_byte_to_char(0x7F), '\x7F');
}
#[test]
fn winansi_byte_to_char_special_range() {
assert_eq!(winansi_byte_to_char(0x80), '\u{20AC}'); assert_eq!(winansi_byte_to_char(0x91), '\u{2018}'); assert_eq!(winansi_byte_to_char(0x92), '\u{2019}'); assert_eq!(winansi_byte_to_char(0x93), '\u{201C}'); assert_eq!(winansi_byte_to_char(0x94), '\u{201D}'); assert_eq!(winansi_byte_to_char(0x96), '\u{2013}'); assert_eq!(winansi_byte_to_char(0x97), '\u{2014}'); assert_eq!(winansi_byte_to_char(0x99), '\u{2122}'); }
#[test]
fn winansi_byte_to_char_high_latin_range() {
assert_eq!(winansi_byte_to_char(0xA0), '\u{00A0}'); assert_eq!(winansi_byte_to_char(0xE9), '\u{00E9}'); assert_eq!(winansi_byte_to_char(0xFF), '\u{00FF}'); }
// ---- strip_subset_prefix ----
#[test]
fn strip_subset_prefix_strips_valid_prefix() {
assert_eq!(
strip_subset_prefix("ABCDEF+Helvetica-Bold"),
"Helvetica-Bold"
);
}
#[test]
fn strip_subset_prefix_strips_any_six_uppercase_letters() {
assert_eq!(strip_subset_prefix("ZZZZZZ+TimesNewRoman"), "TimesNewRoman");
}
#[test]
fn strip_subset_prefix_leaves_non_prefixed_name() {
assert_eq!(strip_subset_prefix("Helvetica"), "Helvetica");
}
#[test]
fn strip_subset_prefix_leaves_short_names() {
assert_eq!(strip_subset_prefix("AB+X"), "AB+X");
}
#[test]
fn strip_subset_prefix_leaves_lowercase_prefix() {
assert_eq!(strip_subset_prefix("abcdef+Font"), "abcdef+Font");
}
#[test]
fn strip_subset_prefix_leaves_mixed_case_prefix() {
assert_eq!(strip_subset_prefix("ABCDEf+Font"), "ABCDEf+Font");
}
#[test]
fn strip_subset_prefix_leaves_empty_string() {
assert_eq!(strip_subset_prefix(""), "");
}
}