use super::*;
use lopdf::Object;
/// A single `bfchar` entry wrapped in a complete CMap program maps source
/// code `0x0001` to the character written as `<65E5>`.
#[test]
fn parse_to_unicode_cmap_basic() {
    let program = b"/CIDInit /ProcSet findresource begin\n\
        12 dict begin\n\
        begincmap\n\
        1 beginbfchar\n\
        <0001> <65E5>\n\
        endbfchar\n\
        endcmap\n\
        end\nend\n";
    let table = parse_to_unicode_cmap(program);
    // U+65E5 is '日'.
    assert_eq!(table.get(&1u16).copied(), Some('日'));
}
/// A four-byte destination (`D840 DC00`) is a UTF-16 surrogate pair and is
/// expected to come back as the single scalar value U+20000.
#[test]
fn parse_to_unicode_cmap_surrogate() {
    let table = parse_to_unicode_cmap(b"1 beginbfchar\n<0001> <D840DC00>\nendbfchar\n");
    assert_eq!(table.get(&1u16).copied(), Some('\u{20000}'));
}
/// A `bfrange` with a single start destination maps every code in
/// `0x20..=0x7E` to consecutive characters; checks both ends and one
/// interior code.
#[test]
fn parse_bfrange_contiguous() {
    let table = parse_to_unicode_cmap(b"1 beginbfrange\n<20> <7E> <0020>\nendbfrange\n");
    for (code, want) in [(0x20u16, ' '), (0x41, 'A'), (0x7E, '~')] {
        assert_eq!(table.get(&code).copied(), Some(want));
    }
}
/// A `bfrange` whose destination is an array assigns one array element to
/// each code in the range, in order.
#[test]
fn parse_bfrange_explicit_array() {
    let table =
        parse_to_unicode_cmap(b"1 beginbfrange\n<20> <21> [<0048> <0069>]\nendbfrange\n");
    for (code, want) in [(0x20u16, 'H'), (0x21, 'i')] {
        assert_eq!(table.get(&code).copied(), Some(want));
    }
}
/// Eight ASCII hex digits decode to the four bytes they spell.
#[test]
fn decode_hex_bytes_roundtrip() {
    let expected = vec![0x00, 0x01, 0x00, 0x02];
    assert_eq!(decode_hex_bytes(b"00010002"), expected);
}
/// A parenthesised literal string is the first token and carries its raw
/// bytes without the delimiters.
#[test]
fn litstr_tokenizer_basic() {
    let tokens = tokenize(b"(Hello)");
    let is_hello = match &tokens[0].0 {
        Token::LitStr(bytes) => bytes == b"Hello",
        _ => false,
    };
    assert!(is_hello);
}
/// Backslash escapes inside a literal string are decoded: `\n` becomes a
/// line feed and the octal escape `\041` becomes `!`.
#[test]
fn litstr_escapes() {
    let tokens = tokenize(b"(He\\nllo\\041)");
    let Token::LitStr(bytes) = &tokens[0].0 else {
        panic!("expected LitStr");
    };
    // Spot-check positions in the decoded string "He\nllo!".
    for (idx, want) in [(0, b'H'), (1, b'e'), (2, b'\n'), (3, b'l'), (6, b'!')] {
        assert_eq!(bytes[idx], want);
    }
}
/// A TJ-style array keeps its elements in order: string, numeric
/// adjustment, string.
#[test]
fn litstr_in_array() {
    let tokens = tokenize(b"[(Hel) -50 (lo)]");
    let Token::Array(items) = &tokens[0].0 else {
        panic!("expected Array");
    };
    assert!(matches!(&items[0], Token::LitStr(s) if s == b"Hel"));
    assert!(matches!(&items[1], Token::Number(v) if (*v + 50.0).abs() < 0.1));
    assert!(matches!(&items[2], Token::LitStr(s) if s == b"lo"));
}
/// Every operator of a minimal text object shows up as a `Keyword` token.
#[test]
fn tokenizer_smoke() {
    let tokens = tokenize(b"BT\n/F0 12 Tf\n100 200 Td\n<0001> Tj\nET\n");

    // Collect the bytes of every keyword token, ignoring operands.
    let mut operators: Vec<&[u8]> = Vec::new();
    for (token, _) in tokens.iter() {
        if let Token::Keyword(word) = token {
            operators.push(word.as_slice());
        }
    }

    let expected: [&[u8]; 5] = [b"BT", b"Tf", b"Td", b"Tj", b"ET"];
    for op in expected {
        assert!(operators.contains(&op));
    }
}
/// The `c [w1 w2 ...]` form of a /W array yields a single run starting at
/// `c` with the listed widths.
#[test]
fn parse_w_array_run_format() {
    let widths: Vec<Object> = vec![500, 600, 700]
        .into_iter()
        .map(Object::Integer)
        .collect();
    let w_array = vec![Object::Integer(0), Object::Array(widths)];

    let runs = parse_w_array(&w_array);
    assert_eq!(runs.len(), 1);
    let run = &runs[0];
    assert_eq!(run.start_gid, 0);
    assert_eq!(run.widths, vec![500, 600, 700]);
}
/// `advance_width` returns the width from a matching run and falls back to
/// `dw` for glyph ids on either side of the run.
#[test]
fn font_info_advance_width_fallback() {
    let only_run = WidthRun {
        start_gid: 5,
        widths: vec![600],
    };
    let info = FontInfo {
        to_unicode: BTreeMap::new(),
        dw: 1000,
        w_runs: vec![only_run],
        bytes_per_char: 2,
        identity_fallback: false,
        base_font: String::new(),
        is_bold: false,
        is_italic: false,
        font_family: String::new(),
    };

    // (glyph id, expected advance): inside the run, below it, above it.
    for (gid, want) in [(5, 600), (0, 1000), (99, 1000)] {
        assert_eq!(info.advance_width(gid), want);
    }
}
/// Spot checks of the WinAnsi table, including one code (0x7F) that has no
/// entry.
#[test]
fn win_ansi_spot_checks() {
    let expectations = [
        (0x20, Some(' ')),
        (0x41, Some('A')),
        (0x80, Some('€')),
        (0xE9, Some('é')),
        (0x7F, None),
    ];
    for (code, want) in expectations {
        assert_eq!(WIN_ANSI_ENCODING[code], want);
    }
}
/// `AGL_TABLE` keys must be strictly increasing (sorted, no duplicates).
#[test]
fn agl_table_sorted() {
    for (offset, pair) in AGL_TABLE.windows(2).enumerate() {
        // Report the index of the second element of the offending pair.
        let i = offset + 1;
        assert!(
            pair[0].0 < pair[1].0,
            "AGL_TABLE not sorted at index {i}: {:?} >= {:?}",
            pair[0].0,
            pair[1].0
        );
    }
}
/// Spot checks of glyph-name lookup: both `euro` and `Euro` resolve, `fi`
/// resolves to the U+FB01 ligature, and an unknown name yields `None`.
#[test]
fn glyph_name_lookup_spot_checks() {
    let cases = [
        (b"space".as_slice(), Some(' ')),
        (b"eacute".as_slice(), Some('é')),
        (b"euro".as_slice(), Some('€')),
        (b"Euro".as_slice(), Some('€')),
        (b"fi".as_slice(), Some('\u{FB01}')),
        (b"nonexistent".as_slice(), None),
    ];
    for (glyph, want) in cases {
        assert_eq!(glyph_name_to_char(glyph), want);
    }
}
/// Converting the WinAnsi table to a map keeps defined codes and omits the
/// undefined one (0x7F).
#[test]
fn encoding_table_to_btree_basic() {
    let table = encoding_table_to_btree(&WIN_ANSI_ENCODING);
    assert_eq!(table.get(&0x41).copied(), Some('A'));
    assert_eq!(table.get(&0x80).copied(), Some('€'));
    assert!(table.get(&0x7F).is_none());
}
/// `parse_font_attributes` returns `(name, bold, italic, family)`; checks
/// subset-prefix stripping, style-suffix detection and family extraction.
#[test]
fn parse_font_attributes_cases() {
    // (input, expected name, bold, italic, expected family)
    let full_cases = [
        ("Helvetica", "Helvetica", false, false, "Helvetica"),
        (
            "ABCDEF+Helvetica-Bold",
            "Helvetica-Bold",
            true,
            false,
            "Helvetica",
        ),
        (
            "TimesNewRoman-BoldItalic",
            "TimesNewRoman-BoldItalic",
            true,
            true,
            "TimesNewRoman",
        ),
    ];
    for (input, want_name, want_bold, want_italic, want_family) in full_cases {
        let (name, bold, italic, family) = parse_font_attributes(input);
        assert_eq!(name, want_name);
        assert_eq!(bold, want_bold);
        assert_eq!(italic, want_italic);
        assert_eq!(family, want_family);
    }

    // "Oblique" counts as italic but not as bold.
    let (_, bold, italic, _) = parse_font_attributes("Arial-Oblique");
    assert!(!bold);
    assert!(italic);

    // "Heavy" counts as bold.
    let (_, bold, _, _) = parse_font_attributes("Futura-Heavy");
    assert!(bold);
}
/// One fragment produces exactly one column zone; no fragments produce none.
///
/// The fragment comes from the file's `make_frag` helper, which fills in the
/// same field values the previous hand-written literal used.
#[test]
fn detect_text_columns_single() {
    let frags = vec![make_frag("Hello", 50.0, 700.0, 100.0, 12.0)];
    let zones = detect_text_columns(&frags, 595.0);
    assert_eq!(zones.len(), 1);
    assert!(detect_text_columns(&[], 595.0).is_empty());
}
/// Two fragments on the same baseline, far apart horizontally, are reported
/// as two zones ordered left to right.
///
/// Both fragments come from the file's `make_frag` helper, which fills in the
/// same field values the previous hand-written literals used.
#[test]
fn detect_text_columns_two_columns() {
    let left = make_frag("Left", 50.0, 700.0, 150.0, 12.0);
    let right = make_frag("Right", 350.0, 700.0, 150.0, 12.0);
    let zones = detect_text_columns(&[left, right], 595.0);
    assert_eq!(zones.len(), 2, "expected two columns, got {:?}", zones);
    assert!(zones[0].x_start < zones[1].x_start);
}
/// Test helper: builds a `TextFragment` with the given text, position and
/// width.
///
/// `fs` is used for `height`, `font_size` and `tf_font_size` alike. All
/// other fields get fixed neutral values: font name `"F1"`, black colour,
/// visible, not bold or italic, and no source or text-matrix provenance.
fn make_frag(text: &str, x: f32, y: f32, w: f32, fs: f32) -> TextFragment {
    TextFragment {
        // Content and geometry supplied by the caller.
        text: text.into(),
        x,
        y,
        width: w,
        height: fs,
        font_size: fs,
        tf_font_size: fs,
        // Fixed styling.
        font_name: "F1".into(),
        font_family: String::new(),
        base_font: String::new(),
        color: [0.0; 3],
        invisible: false,
        is_bold: false,
        is_italic: false,
        space_advance: 0.0,
        tm_y_scale: 1.0,
        // No provenance information.
        source_stream: None,
        source_op_start: None,
        source_op_end: None,
        source_xobject: None,
        tm_origin_x: None,
        tm_origin_y: None,
        tm_x_scale: None,
        tm_lm_x: None,
        tm_lm_y: None,
    }
}
/// Three left-aligned fragments on successive baselines become three cells
/// in rows 0, 1 and 2, top to bottom, with the first in column 0.
#[test]
fn extract_table_cells_single_column() {
    // (text, baseline y, width), all starting at x = 50.
    let frags: Vec<TextFragment> = vec![
        ("Header", 700.0, 80.0),
        ("Row 1", 680.0, 60.0),
        ("Row 2", 660.0, 60.0),
    ]
    .into_iter()
    .map(|(text, y, width)| make_frag(text, 50.0, y, width, 12.0))
    .collect();

    let cells = extract_table_cells(&frags, 595.0, 842.0);
    assert_eq!(cells.len(), 3);
    for (cell, want_row) in cells.iter().zip([0, 1, 2]) {
        assert_eq!(cell.row, want_row);
    }
    assert_eq!(cells[0].col, 0);
    assert_eq!(cells[0].text, "Header");
}
/// A 2x2 grid of fragments yields four cells; the top row holds "A1" in
/// column 0 and "B1" in column 1.
#[test]
fn extract_table_cells_two_columns() {
    // (text, x, baseline y)
    let frags: Vec<TextFragment> = vec![
        ("A1", 50.0, 700.0),
        ("B1", 300.0, 700.0),
        ("A2", 50.0, 680.0),
        ("B2", 300.0, 680.0),
    ]
    .into_iter()
    .map(|(text, x, y)| make_frag(text, x, y, 80.0, 12.0))
    .collect();

    let cells = extract_table_cells(&frags, 595.0, 842.0);
    assert_eq!(cells.len(), 4);
    for (col, want) in [(0, "A1"), (1, "B1")] {
        let cell = cells
            .iter()
            .find(|c| c.row == 0 && c.col == col)
            .unwrap();
        assert_eq!(cell.text, want);
    }
}
/// Two closely spaced fragments on one baseline end up in a single cell
/// whose text contains both words.
#[test]
fn extract_table_cells_merges_same_cell_fragments() {
    let frags = vec![
        make_frag("Hello", 50.0, 700.0, 30.0, 12.0),
        make_frag("World", 85.0, 700.0, 30.0, 12.0),
    ];
    let cells = extract_table_cells(&frags, 595.0, 842.0);
    assert_eq!(cells.len(), 1);

    let merged = &cells[0].text;
    for word in ["Hello", "World"] {
        assert!(merged.contains(word));
    }
}
/// No fragments means no cells, for a normal page width and a zero one.
#[test]
fn extract_table_cells_empty_returns_empty() {
    for page_width in [595.0, 0.0] {
        assert!(extract_table_cells(&[], page_width, 842.0).is_empty());
    }
}
/// `GroupingStrategy::Raw` keeps two fragments on the same line as two
/// separate groups.
#[test]
fn group_text_fragments_raw() {
    let frags: Vec<TextFragment> = vec![("A", 50.0), ("B", 80.0)]
        .into_iter()
        .map(|(text, x)| make_frag(text, x, 700.0, 20.0, 12.0))
        .collect();
    let groups = group_text_fragments(&frags, GroupingStrategy::Raw);
    assert_eq!(groups.len(), 2);
}
/// `GroupingStrategy::Line` merges fragments that share a baseline: "A" and
/// "B" at y = 700 form the first group, "C" at y = 680 the second.
#[test]
fn group_text_fragments_line() {
    // (text, x, baseline y)
    let frags: Vec<TextFragment> = vec![("A", 50.0, 700.0), ("B", 80.0, 700.0), ("C", 50.0, 680.0)]
        .into_iter()
        .map(|(text, x, y)| make_frag(text, x, y, 20.0, 12.0))
        .collect();

    let groups = group_text_fragments(&frags, GroupingStrategy::Line);
    assert_eq!(groups.len(), 2, "expected 2 lines, got {}", groups.len());

    let first_line = &groups[0].text;
    assert!(first_line.contains('A'));
    assert!(first_line.contains('B'));
}
/// `GroupingStrategy::Paragraph` joins lines 14 units apart ("L1", "L2")
/// and starts a new group after the 56-unit gap before "L3".
#[test]
fn group_text_fragments_paragraph() {
    // (text, baseline y), all at x = 50.
    let frags: Vec<TextFragment> = vec![("L1", 700.0), ("L2", 686.0), ("L3", 630.0)]
        .into_iter()
        .map(|(text, y)| make_frag(text, 50.0, y, 20.0, 12.0))
        .collect();

    let groups = group_text_fragments(&frags, GroupingStrategy::Paragraph);
    assert_eq!(groups.len(), 2, "expected 2 paragraphs, got {}", groups.len());

    let first = &groups[0].text;
    assert!(first.contains("L1"));
    assert!(first.contains("L2"));
    assert!(groups[1].text.contains("L3"));
}
/// Text drawn by a Form XObject is extracted when the XObject is listed only
/// in the /Resources of the parent /Pages node.
///
/// The page has no /Resources of its own and its content stream is just
/// `q Q`. The form carries its own /Font resources and draws "Hello".
#[test]
fn extract_xobjects_from_inherited_resources() {
    use lopdf::{Document, Stream};

    fn name(value: &str) -> Object {
        Object::Name(value.as_bytes().to_vec())
    }
    fn page_rect() -> Object {
        Object::Array(vec![
            Object::Integer(0),
            Object::Integer(0),
            Object::Integer(595),
            Object::Integer(842),
        ])
    }

    let mut doc = Document::new();

    // Simple Type1 font.
    let mut font = Dictionary::new();
    font.set("Type", name("Font"));
    font.set("Subtype", name("Type1"));
    font.set("BaseFont", name("Helvetica"));
    let font_id = doc.add_object(Object::Dictionary(font));

    // Form XObject that names the font in its own /Resources.
    let mut form_fonts = Dictionary::new();
    form_fonts.set("F1", Object::Reference(font_id));
    let mut form_resources = Dictionary::new();
    form_resources.set("Font", Object::Dictionary(form_fonts));
    let mut form = Dictionary::new();
    form.set("Type", name("XObject"));
    form.set("Subtype", name("Form"));
    form.set("BBox", page_rect());
    form.set("Resources", Object::Dictionary(form_resources));
    let form_stream = Stream::new(form, b"BT /F1 12 Tf (Hello) Tj ET".to_vec());
    let form_id = doc.add_object(Object::Stream(form_stream));

    // Page content that draws nothing itself.
    let content_stream = Stream::new(Dictionary::new(), b"q Q".to_vec());
    let content_id = doc.add_object(Object::Stream(content_stream));

    // Page without a /Resources entry.
    let mut page = Dictionary::new();
    page.set("Type", name("Page"));
    page.set("MediaBox", page_rect());
    page.set("Contents", Object::Reference(content_id));
    let page_id = doc.add_object(Object::Dictionary(page));

    // /Pages node holding the /XObject resources.
    let mut xobjects = Dictionary::new();
    xobjects.set("X1", Object::Reference(form_id));
    let mut tree_resources = Dictionary::new();
    tree_resources.set("XObject", Object::Dictionary(xobjects));
    let mut pages = Dictionary::new();
    pages.set("Type", name("Pages"));
    pages.set("Kids", Object::Array(vec![Object::Reference(page_id)]));
    pages.set("Count", Object::Integer(1));
    pages.set("Resources", Object::Dictionary(tree_resources));
    let pages_id = doc.add_object(Object::Dictionary(pages));

    // Link the page to its parent now that the parent id exists.
    if let Ok(page_dict) = doc
        .get_object_mut(page_id)
        .and_then(|obj| obj.as_dict_mut())
    {
        page_dict.set("Parent", Object::Reference(pages_id));
    }

    let mut catalog = Dictionary::new();
    catalog.set("Type", name("Catalog"));
    catalog.set("Pages", Object::Reference(pages_id));
    let catalog_id = doc.add_object(Object::Dictionary(catalog));
    doc.trailer.set("Root", Object::Reference(catalog_id));

    let frags = extract_text_runs_from_page(&doc, page_id).unwrap();
    let text: String = frags.iter().map(|f| f.text.as_str()).collect();
    assert!(
        !frags.is_empty(),
        "expected text from XObject with inherited /Resources, got none"
    );
    assert!(
        text.contains("Hello"),
        "expected 'Hello' in extracted text, got: {text:?}"
    );
}
/// Same layout as the inherited-resources XObject test, but the form uses a
/// Type0/CID font with a /ToUnicode CMap and shows a hex string.
///
/// The CMap maps `<0048>` to 'H' and `<0069>` to 'i', so `<00480069> Tj`
/// is expected to extract as "Hi".
#[test]
fn extract_cid_xobject_inherited_resources() {
    use lopdf::{Document, Stream};

    fn name(value: &str) -> Object {
        Object::Name(value.as_bytes().to_vec())
    }
    fn literal(value: &str) -> Object {
        Object::String(value.as_bytes().to_vec(), lopdf::StringFormat::Literal)
    }
    fn page_rect() -> Object {
        Object::Array(vec![
            Object::Integer(0),
            Object::Integer(0),
            Object::Integer(595),
            Object::Integer(842),
        ])
    }

    let cmap_bytes = b"/CIDInit /ProcSet findresource begin\n\
        12 dict begin\n\
        begincmap\n\
        /CIDSystemInfo << /Registry (Adobe) /Ordering (Identity) /Supplement 0 >> def\n\
        /CMapName /Adobe-Identity-H def\n\
        /CMapType 1 def\n\
        2 beginbfchar\n\
        <0048> <0048>\n\
        <0069> <0069>\n\
        endbfchar\n\
        endcmap\n\
        end end\n"
        .to_vec();

    let mut doc = Document::new();
    let cmap_id = doc.add_object(Object::Stream(Stream::new(Dictionary::new(), cmap_bytes)));

    // Descendant CIDFontType2 with an Adobe-Identity system info.
    let mut cid_system_info = Dictionary::new();
    cid_system_info.set("Registry", literal("Adobe"));
    cid_system_info.set("Ordering", literal("Identity"));
    cid_system_info.set("Supplement", Object::Integer(0));
    let mut cid_font = Dictionary::new();
    cid_font.set("Type", name("Font"));
    cid_font.set("Subtype", name("CIDFontType2"));
    cid_font.set("BaseFont", name("TestCIDFont"));
    cid_font.set("CIDSystemInfo", Object::Dictionary(cid_system_info));
    cid_font.set("DW", Object::Integer(1000));
    let cid_font_id = doc.add_object(Object::Dictionary(cid_font));

    // Type0 wrapper font with Identity-H encoding and the ToUnicode CMap.
    let mut font = Dictionary::new();
    font.set("Type", name("Font"));
    font.set("Subtype", name("Type0"));
    font.set("BaseFont", name("TestCIDFont"));
    font.set("Encoding", name("Identity-H"));
    font.set(
        "DescendantFonts",
        Object::Array(vec![Object::Reference(cid_font_id)]),
    );
    font.set("ToUnicode", Object::Reference(cmap_id));
    let font_id = doc.add_object(Object::Dictionary(font));

    // Form XObject that names the font in its own /Resources.
    let mut form_fonts = Dictionary::new();
    form_fonts.set("F1", Object::Reference(font_id));
    let mut form_resources = Dictionary::new();
    form_resources.set("Font", Object::Dictionary(form_fonts));
    let mut form = Dictionary::new();
    form.set("Type", name("XObject"));
    form.set("Subtype", name("Form"));
    form.set("BBox", page_rect());
    form.set("Resources", Object::Dictionary(form_resources));
    let form_stream = Stream::new(form, b"BT /F1 12 Tf <00480069> Tj ET".to_vec());
    let form_id = doc.add_object(Object::Stream(form_stream));

    // Page content that draws nothing itself.
    let content_stream = Stream::new(Dictionary::new(), b"q Q".to_vec());
    let content_id = doc.add_object(Object::Stream(content_stream));

    // Page without a /Resources entry.
    let mut page = Dictionary::new();
    page.set("Type", name("Page"));
    page.set("MediaBox", page_rect());
    page.set("Contents", Object::Reference(content_id));
    let page_id = doc.add_object(Object::Dictionary(page));

    // /Pages node holding the /XObject resources.
    let mut xobjects = Dictionary::new();
    xobjects.set("X1", Object::Reference(form_id));
    let mut tree_resources = Dictionary::new();
    tree_resources.set("XObject", Object::Dictionary(xobjects));
    let mut pages = Dictionary::new();
    pages.set("Type", name("Pages"));
    pages.set("Kids", Object::Array(vec![Object::Reference(page_id)]));
    pages.set("Count", Object::Integer(1));
    pages.set("Resources", Object::Dictionary(tree_resources));
    let pages_id = doc.add_object(Object::Dictionary(pages));

    // Link the page to its parent now that the parent id exists.
    if let Ok(page_dict) = doc
        .get_object_mut(page_id)
        .and_then(|obj| obj.as_dict_mut())
    {
        page_dict.set("Parent", Object::Reference(pages_id));
    }

    let mut catalog = Dictionary::new();
    catalog.set("Type", name("Catalog"));
    catalog.set("Pages", Object::Reference(pages_id));
    let catalog_id = doc.add_object(Object::Dictionary(catalog));
    doc.trailer.set("Root", Object::Reference(catalog_id));

    let frags = extract_text_runs_from_page(&doc, page_id).unwrap();
    let text: String = frags.iter().map(|f| f.text.as_str()).collect();
    assert!(
        !frags.is_empty(),
        "expected CID text from XObject with inherited /Resources, got none"
    );
    assert!(
        text.contains("Hi"),
        "expected 'Hi' from CID+hex decode, got: {text:?}"
    );
}
/// Fragment coordinates and font size are reported after applying the CTM
/// set by `cm`.
///
/// With CTM `[0.24 0 0 -0.24 0 841]`, `Td(100, 200)` and a 100-unit font,
/// the fragment should land at x ≈ 24, y ≈ 793 with font size ≈ 24.
#[test]
fn ctm_transforms_coordinates_to_page_space() {
    // Single-byte font that knows only 'A'.
    let mut glyph_map = BTreeMap::new();
    glyph_map.insert(0x41u16, 'A');
    let font = FontInfo {
        to_unicode: glyph_map,
        dw: 1000,
        w_runs: vec![WidthRun {
            start_gid: 0x41,
            widths: vec![600],
        }],
        bytes_per_char: 1,
        identity_fallback: false,
        base_font: String::new(),
        is_bold: false,
        is_italic: false,
        font_family: String::new(),
    };
    let mut fonts = HashMap::new();
    fonts.insert(b"F1".to_vec(), font);

    let stream = b"q\n\
        0.24 0 0 -0.24 0 841 cm\n\
        BT\n\
        /F1 100 Tf\n\
        100 200 Td\n\
        (A) Tj\n\
        ET\n\
        Q\n";
    let mut state = ParseCarryState::default();
    let mut frags: Vec<TextFragment> = Vec::new();
    parse_content_stream(stream, &fonts, &mut state, &mut frags, Some(0), None);
    assert_eq!(frags.len(), 1, "expected one TextFragment");

    let f = &frags[0];
    let distance = |actual: f32, expected: f32| (actual - expected).abs();
    assert!(
        distance(f.x, 24.0) < 0.5,
        "x should be ~24 (0.24*100), got {}",
        f.x
    );
    assert!(
        distance(f.y, 793.0) < 0.5,
        "y should be ~793 (-0.24*200 + 841), got {}",
        f.y
    );
    assert!(
        distance(f.font_size, 24.0) < 0.5,
        "font_size should be ~24 (100*0.24), got {}",
        f.font_size
    );
}
/// After parsing `q … cm /Fm0 Do Q`, `state.ctm` holds the matrix set by
/// `cm`, even though the stream ends with `Q`.
// NOTE(review): presumably the state records the CTM in effect at `Do`;
// confirm against `ParseCarryState`.
#[test]
fn ctm_at_do_captured_in_state() {
    let fonts: HashMap<Vec<u8>, FontInfo> = HashMap::new();
    let mut state = ParseCarryState::default();
    let mut frags: Vec<TextFragment> = Vec::new();
    parse_content_stream(
        b"q\n0.24 0 0 -0.24 0 841 cm\n/Fm0 Do\nQ\n",
        &fonts,
        &mut state,
        &mut frags,
        Some(0),
        None,
    );

    let near = |got: f32, want: f32| (got - want).abs() < 1e-5f32;
    assert!(
        near(state.ctm[0], 0.24),
        "ctm[0] should be 0.24, got {}",
        state.ctm[0]
    );
    assert!(
        near(state.ctm[3], -0.24),
        "ctm[3] should be -0.24, got {}",
        state.ctm[3]
    );
    assert!(
        near(state.ctm[5], 841.0),
        "ctm[5] should be 841, got {}",
        state.ctm[5]
    );
}
/// A Form XObject whose own /Resources has only /ProcSet still extracts
/// text when the font it names (`F1`) is in the page's /Resources.
///
/// The page content invokes the form with `/X1 Do`.
#[test]
fn extract_xobject_font_from_page_resources() {
    use lopdf::{Document, Stream};

    fn name(value: &str) -> Object {
        Object::Name(value.as_bytes().to_vec())
    }
    fn page_rect() -> Object {
        Object::Array(vec![
            Object::Integer(0),
            Object::Integer(0),
            Object::Integer(595),
            Object::Integer(842),
        ])
    }

    let mut doc = Document::new();

    // Simple Type1 font, referenced only from the page resources.
    let mut font = Dictionary::new();
    font.set("Type", name("Font"));
    font.set("Subtype", name("Type1"));
    font.set("BaseFont", name("Helvetica"));
    let font_id = doc.add_object(Object::Dictionary(font));

    // Form XObject whose /Resources has no /Font entry.
    let mut form_resources = Dictionary::new();
    form_resources.set("ProcSet", Object::Array(vec![name("PDF"), name("Text")]));
    let mut form = Dictionary::new();
    form.set("Type", name("XObject"));
    form.set("Subtype", name("Form"));
    form.set("BBox", page_rect());
    form.set("Resources", Object::Dictionary(form_resources));
    let form_stream = Stream::new(form, b"BT /F1 12 Tf (Hello) Tj ET".to_vec());
    let form_id = doc.add_object(Object::Stream(form_stream));

    // Page content invoking the form.
    let content_stream = Stream::new(Dictionary::new(), b"/X1 Do".to_vec());
    let content_id = doc.add_object(Object::Stream(content_stream));

    // Page-level resources: both the font and the XObject.
    let mut page_fonts = Dictionary::new();
    page_fonts.set("F1", Object::Reference(font_id));
    let mut page_xobjects = Dictionary::new();
    page_xobjects.set("X1", Object::Reference(form_id));
    let mut page_resources = Dictionary::new();
    page_resources.set("Font", Object::Dictionary(page_fonts));
    page_resources.set("XObject", Object::Dictionary(page_xobjects));

    let mut page = Dictionary::new();
    page.set("Type", name("Page"));
    page.set("MediaBox", page_rect());
    page.set("Resources", Object::Dictionary(page_resources));
    page.set("Contents", Object::Reference(content_id));
    let page_id = doc.add_object(Object::Dictionary(page));

    let mut pages = Dictionary::new();
    pages.set("Type", name("Pages"));
    pages.set("Kids", Object::Array(vec![Object::Reference(page_id)]));
    pages.set("Count", Object::Integer(1));
    let pages_id = doc.add_object(Object::Dictionary(pages));

    // Link the page to its parent now that the parent id exists.
    if let Ok(page_dict) = doc
        .get_object_mut(page_id)
        .and_then(|obj| obj.as_dict_mut())
    {
        page_dict.set("Parent", Object::Reference(pages_id));
    }

    let mut catalog = Dictionary::new();
    catalog.set("Type", name("Catalog"));
    catalog.set("Pages", Object::Reference(pages_id));
    let catalog_id = doc.add_object(Object::Dictionary(catalog));
    doc.trailer.set("Root", Object::Reference(catalog_id));

    let frags = extract_text_runs_from_page(&doc, page_id).unwrap();
    let text: String = frags.iter().map(|f| f.text.as_str()).collect();
    assert!(
        text.contains("Hello"),
        "XObject referencing page-level font must extract text; got: {text:?}"
    );
}
/// A text object split across two content streams is still extracted: the
/// first stream opens `BT` and sets font and position, the second shows the
/// string and closes with `ET`.
#[test]
fn extract_cross_stream_bt_tj() {
    use lopdf::{Document, Stream};

    fn name(value: &str) -> Object {
        Object::Name(value.as_bytes().to_vec())
    }

    let mut doc = Document::new();

    let mut font = Dictionary::new();
    font.set("Type", name("Font"));
    font.set("Subtype", name("Type1"));
    font.set("BaseFont", name("Helvetica"));
    let font_id = doc.add_object(Object::Dictionary(font));

    // The two halves of the text object.
    let opening = Stream::new(Dictionary::new(), b"BT /F1 12 Tf 100 700 Td".to_vec());
    let stream_a_id = doc.add_object(Object::Stream(opening));
    let closing = Stream::new(Dictionary::new(), b"(Hello) Tj ET".to_vec());
    let stream_b_id = doc.add_object(Object::Stream(closing));

    let mut page_fonts = Dictionary::new();
    page_fonts.set("F1", Object::Reference(font_id));
    let mut page_resources = Dictionary::new();
    page_resources.set("Font", Object::Dictionary(page_fonts));

    let media_box = Object::Array(vec![
        Object::Integer(0),
        Object::Integer(0),
        Object::Integer(595),
        Object::Integer(842),
    ]);
    let contents = Object::Array(vec![
        Object::Reference(stream_a_id),
        Object::Reference(stream_b_id),
    ]);
    let mut page = Dictionary::new();
    page.set("Type", name("Page"));
    page.set("MediaBox", media_box);
    page.set("Resources", Object::Dictionary(page_resources));
    page.set("Contents", contents);
    let page_id = doc.add_object(Object::Dictionary(page));

    let mut pages = Dictionary::new();
    pages.set("Type", name("Pages"));
    pages.set("Kids", Object::Array(vec![Object::Reference(page_id)]));
    pages.set("Count", Object::Integer(1));
    let pages_id = doc.add_object(Object::Dictionary(pages));

    // Link the page to its parent now that the parent id exists.
    if let Ok(page_dict) = doc
        .get_object_mut(page_id)
        .and_then(|obj| obj.as_dict_mut())
    {
        page_dict.set("Parent", Object::Reference(pages_id));
    }

    let mut catalog = Dictionary::new();
    catalog.set("Type", name("Catalog"));
    catalog.set("Pages", Object::Reference(pages_id));
    let catalog_id = doc.add_object(Object::Dictionary(catalog));
    doc.trailer.set("Root", Object::Reference(catalog_id));

    let frags = extract_text_runs_from_page(&doc, page_id).unwrap();
    let text: String = frags.iter().map(|f| f.text.as_str()).collect();
    assert!(
        text.contains("Hello"),
        "text inside BT split across streams must be extracted; got: {text:?}"
    );
}
/// An empty fragment slice has no bounding box.
#[test]
fn text_fragment_bounds_empty() {
    let bounds = text_fragment_bounds(&[]);
    assert!(bounds.is_none());
}
/// For one fragment the box is `[x, y - 0.25 * height, width, height]`.
#[test]
fn text_fragment_bounds_single() {
    let near = |got: f32, want: f32| (got - want).abs() < 0.01;

    let only = make_frag("A", 100.0, 700.0, 50.0, 12.0);
    let [x, y, w, h] = text_fragment_bounds(&[only]).unwrap();

    assert!(near(x, 100.0), "x={x}");
    // The box bottom sits a quarter of the height below the baseline.
    assert!(near(y, 700.0 - 12.0 * 0.25), "y={y}");
    assert!(near(w, 50.0), "w={w}");
    assert!(near(h, 12.0), "h={h}");
}
/// For several fragments the box is the union of the per-fragment boxes,
/// each spanning from `y - 0.25 * height` to `y + 0.75 * height`.
#[test]
fn text_fragment_bounds_multiple() {
    let near = |got: f32, want: f32| (got - want).abs() < 0.01;

    let first = make_frag("A", 50.0, 700.0, 40.0, 12.0);
    let second = make_frag("B", 200.0, 680.0, 60.0, 14.0);
    let [x, y, w, h] = text_fragment_bounds(&[first, second]).unwrap();

    // Horizontal extent: leftmost x = 50, rightmost edge = 200 + 60.
    assert!(near(x, 50.0), "x={x}");
    assert!(near(w, 210.0), "w={w}");

    // Vertical extent: lowest bottom edge to highest top edge.
    let expected_y_min = (700.0f32 - 12.0 * 0.25).min(680.0 - 14.0 * 0.25);
    let expected_y_max = (700.0f32 + 12.0 * 0.75).max(680.0 + 14.0 * 0.75);
    assert!(near(y, expected_y_min), "y={y}");
    assert!(near(h, expected_y_max - expected_y_min), "h={h}");
}
/// Checks the text-matrix provenance fields on extracted fragments.
///
/// * Without a `Tm` operator, `tm_origin_x` is `None` on every fragment.
/// * With `1 0 0 1 100 700 Tm`, every fragment reports `tm_origin_x ≈ 100`.
///   After a later `0 -14 Td`, the last fragment starts at x ≈ 100 and
///   reports `tm_lm_x ≈ 100`.
#[test]
fn tm_origin_preserves_column_anchor() {
    use lopdf::{Document, Stream};

    /// Builds a one-page document whose only content stream is
    /// `stream_bytes`, with a Type1 Helvetica font registered as `F1` in the
    /// page resources. Returns the document and the page's object id.
    fn single_page_doc(stream_bytes: Vec<u8>) -> (Document, lopdf::ObjectId) {
        let mut doc = Document::new();

        let mut font_d = Dictionary::new();
        font_d.set("Type", Object::Name(b"Font".to_vec()));
        font_d.set("Subtype", Object::Name(b"Type1".to_vec()));
        font_d.set("BaseFont", Object::Name(b"Helvetica".to_vec()));
        let font_id = doc.add_object(Object::Dictionary(font_d));

        let content_id =
            doc.add_object(Object::Stream(Stream::new(Dictionary::new(), stream_bytes)));

        let mut font_dict = Dictionary::new();
        font_dict.set("F1", Object::Reference(font_id));
        let mut page_res = Dictionary::new();
        page_res.set("Font", Object::Dictionary(font_dict));

        let mut page_d = Dictionary::new();
        page_d.set("Type", Object::Name(b"Page".to_vec()));
        page_d.set(
            "MediaBox",
            Object::Array(vec![
                Object::Integer(0),
                Object::Integer(0),
                Object::Integer(595),
                Object::Integer(842),
            ]),
        );
        page_d.set("Resources", Object::Dictionary(page_res));
        page_d.set("Contents", Object::Reference(content_id));
        let page_id = doc.add_object(Object::Dictionary(page_d));

        let mut pages_d = Dictionary::new();
        pages_d.set("Type", Object::Name(b"Pages".to_vec()));
        pages_d.set("Kids", Object::Array(vec![Object::Reference(page_id)]));
        pages_d.set("Count", Object::Integer(1));
        let pages_id = doc.add_object(Object::Dictionary(pages_d));

        // Link the page to its parent now that the parent id exists.
        if let Ok(obj) = doc.get_object_mut(page_id) && let Ok(d) = obj.as_dict_mut() {
            d.set("Parent", Object::Reference(pages_id));
        }

        let mut catalog = Dictionary::new();
        catalog.set("Type", Object::Name(b"Catalog".to_vec()));
        catalog.set("Pages", Object::Reference(pages_id));
        let catalog_id = doc.add_object(Object::Dictionary(catalog));
        doc.trailer.set("Root", Object::Reference(catalog_id));

        (doc, page_id)
    }

    // Case 1: positioning with Td only, so no Tm origin is recorded.
    let (doc, page_id) =
        single_page_doc(b"BT /F1 12 Tf 100 700 Td (AB) Tj 0 -14 Td (C) Tj ET".to_vec());
    let frags = extract_text_runs_from_page(&doc, page_id).unwrap();
    assert!(!frags.is_empty(), "expected fragments");
    for f in &frags {
        assert!(
            f.tm_origin_x.is_none(),
            "no Tm in stream → tm_origin_x should be None, got {:?}",
            f.tm_origin_x
        );
    }

    // Case 2: explicit Tm at (100, 700), then a Td to the next line.
    let (doc2, page_id2) =
        single_page_doc(b"BT /F1 12 Tf 1 0 0 1 100 700 Tm (AB) Tj 0 -14 Td (C) Tj ET".to_vec());
    let frags2 = extract_text_runs_from_page(&doc2, page_id2).unwrap();
    assert!(!frags2.is_empty(), "expected fragments with Tm");
    for f in &frags2 {
        // NaN makes the comparison below fail when the origin is missing.
        let ox = f.tm_origin_x.unwrap_or(f32::NAN);
        assert!(
            (ox - 100.0).abs() < 0.5,
            "tm_origin_x should be ~100, got {ox} for {:?}",
            f.text
        );
    }

    // The second-row checks run only when at least two fragments came back.
    if frags2.len() >= 2 {
        let last = &frags2[frags2.len() - 1];
        let second_row_x = last.x;
        assert!(
            (second_row_x - 100.0).abs() < 1.0,
            "after Td(0,-14) x should reset to T_lm_x=100, got {second_row_x}"
        );
        let second_lm_x = last.tm_lm_x.unwrap_or(f32::NAN);
        assert!(
            (second_lm_x - 100.0).abs() < 0.5,
            "second row tm_lm_x should be 100, got {second_lm_x}"
        );
    }
}
/// Checks `tm_x_scale` / `tm_y_scale` reporting and how `Td` offsets are
/// scaled by the text matrix, in three cases:
///
/// 1. Uniform 10x `Tm` followed by `5 0 Td`: both fragments report
///    `tm_x_scale ≈ 10`, and B starts at 50 + 5 * 10.
/// 2. Non-uniform `Tm` (5 by 2): x and y scales are reported separately and
///    the fragment height follows the y scale.
/// 3. No `Tm` at all: `tm_x_scale` is `None`.
#[test]
fn tm_x_scale_and_td_scaling() {
    use lopdf::{Document, Stream};

    /// Builds a one-page document whose only content stream is
    /// `stream_bytes`, with a Type1 Helvetica font registered as `F1`.
    fn make_doc(stream_bytes: Vec<u8>) -> (Document, lopdf::ObjectId) {
        let name = |value: &str| Object::Name(value.as_bytes().to_vec());
        let mut doc = Document::new();

        let mut font = lopdf::Dictionary::new();
        font.set("Type", name("Font"));
        font.set("Subtype", name("Type1"));
        font.set("BaseFont", name("Helvetica"));
        let font_id = doc.add_object(Object::Dictionary(font));

        let content = Stream::new(lopdf::Dictionary::new(), stream_bytes);
        let content_id = doc.add_object(Object::Stream(content));

        let mut fonts = lopdf::Dictionary::new();
        fonts.set("F1", Object::Reference(font_id));
        let mut resources = lopdf::Dictionary::new();
        resources.set("Font", Object::Dictionary(fonts));

        let media_box = Object::Array(vec![
            Object::Integer(0),
            Object::Integer(0),
            Object::Integer(595),
            Object::Integer(842),
        ]);
        let mut page = lopdf::Dictionary::new();
        page.set("Type", name("Page"));
        page.set("MediaBox", media_box);
        page.set("Resources", Object::Dictionary(resources));
        page.set("Contents", Object::Reference(content_id));
        let page_id = doc.add_object(Object::Dictionary(page));

        let mut pages = lopdf::Dictionary::new();
        pages.set("Type", name("Pages"));
        pages.set("Kids", Object::Array(vec![Object::Reference(page_id)]));
        pages.set("Count", Object::Integer(1));
        let pages_id = doc.add_object(Object::Dictionary(pages));

        // Link the page to its parent now that the parent id exists.
        if let Ok(obj) = doc.get_object_mut(page_id) && let Ok(page_dict) = obj.as_dict_mut() {
            page_dict.set("Parent", Object::Reference(pages_id));
        }

        let mut catalog = lopdf::Dictionary::new();
        catalog.set("Type", name("Catalog"));
        catalog.set("Pages", Object::Reference(pages_id));
        let catalog_id = doc.add_object(Object::Dictionary(catalog));
        doc.trailer.set("Root", Object::Reference(catalog_id));

        (doc, page_id)
    }

    let eps = 1.0f32;

    // Case 1: uniform 10x text matrix, then a Td of 5 text-space units.
    {
        let (doc, page_id) =
            make_doc(b"BT /F1 1 Tf 10 0 0 10 50 700 Tm (A) Tj 5 0 Td (B) Tj ET".to_vec());
        let frags = extract_text_runs_from_page(&doc, page_id).unwrap();
        assert_eq!(frags.len(), 2, "expected 2 fragments (A and B)");
        for f in &frags {
            let xs = f.tm_x_scale.unwrap_or(f32::NAN);
            assert!((xs - 10.0).abs() < 0.01, "tm_x_scale should be 10, got {xs}");
        }

        let (first, second) = (&frags[0], &frags[1]);
        let x_b = second.x;
        let expected_x_b = 50.0 + 5.0 * 10.0;
        assert!(
            (x_b - expected_x_b).abs() < eps,
            "Td(5,0) resets T_m to T_lm_new=100; got x_B={x_b}, expected={expected_x_b}"
        );
        assert!(
            (first.tm_lm_x.unwrap_or(f32::NAN) - 50.0).abs() < 0.5,
            "A.tm_lm_x should be 50 (Tm anchor), got {:?}",
            first.tm_lm_x
        );
        assert!(
            (second.tm_lm_x.unwrap_or(f32::NAN) - 100.0).abs() < 0.5,
            "B.tm_lm_x should be 100 (after Td), got {:?}",
            second.tm_lm_x
        );
    }

    // Case 2: non-uniform text matrix (x scale 5, y scale 2).
    {
        let (doc, page_id) = make_doc(b"BT /F1 1 Tf 5 0 0 2 100 700 Tm (A) Tj ET".to_vec());
        let frags = extract_text_runs_from_page(&doc, page_id).unwrap();
        assert!(!frags.is_empty());

        let f = &frags[0];
        let xs = f.tm_x_scale.unwrap_or(f32::NAN);
        assert!((xs - 5.0).abs() < 0.01, "tm_x_scale should be 5, got {xs}");
        assert!(
            (f.tm_y_scale - 2.0).abs() < 0.01,
            "tm_y_scale should be 2, got {}",
            f.tm_y_scale
        );
        assert!(
            (f.height - 2.0).abs() < 0.01,
            "height should be ≈2 (y_scale), got {}",
            f.height
        );
        assert!(
            f.width > f.height,
            "width (x_scale=5 based) should exceed height (y_scale=2 based); w={} h={}",
            f.width,
            f.height
        );
    }

    // Case 3: no Tm operator, so no x scale is recorded.
    {
        let (doc, page_id) = make_doc(b"BT /F1 12 Tf 100 700 Td (A) Tj ET".to_vec());
        let frags = extract_text_runs_from_page(&doc, page_id).unwrap();
        assert!(!frags.is_empty());
        for f in &frags {
            assert!(
                f.tm_x_scale.is_none(),
                "no Tm → tm_x_scale should be None, got {:?}",
                f.tm_x_scale
            );
        }
    }
}
/// Verifies that x-positions stay column-aligned across two rows of a
/// form-like layout driven by one `Tm` followed by relative `Td` moves.
///
/// The content stream sets `10 0 0 10 50 700 Tm`, then shows `A`, `B`, `C`
/// and `D`, separated by `200 0 Td`, `-200 -14 Td` and `200 0 Td`.
/// Expectations (all within a tolerance of 1.0):
/// * exactly four fragments are produced;
/// * every fragment reports `tm_origin_x` of 50;
/// * `A` and `C` sit at x = 50, `B` and `D` at x = 200 * 10 + 50;
/// * each fragment's `tm_lm_x` equals the expected x of its own column.
#[test]
fn form_pdf_column_stability() {
    use lopdf::{Document, Stream};

    let content = b"BT /F1 1 Tf 10 0 0 10 50 700 Tm \
        (A) Tj 200 0 Td (B) Tj -200 -14 Td (C) Tj 200 0 Td (D) Tj ET"
        .to_vec();

    let mut doc = Document::new();

    // Objects are added in the order font, contents, page, pages, catalog.
    let font_id = {
        let mut font = lopdf::Dictionary::new();
        font.set("Type", Object::Name(b"Font".to_vec()));
        font.set("Subtype", Object::Name(b"Type1".to_vec()));
        font.set("BaseFont", Object::Name(b"Helvetica".to_vec()));
        doc.add_object(Object::Dictionary(font))
    };

    let contents_id = doc.add_object(Object::Stream(Stream::new(
        lopdf::Dictionary::new(),
        content,
    )));

    let page_id = {
        let mut fonts = lopdf::Dictionary::new();
        fonts.set("F1", Object::Reference(font_id));
        let mut resources = lopdf::Dictionary::new();
        resources.set("Font", Object::Dictionary(fonts));

        let media_box: Vec<Object> = [0, 0, 595, 842]
            .into_iter()
            .map(Object::Integer)
            .collect();

        let mut page = lopdf::Dictionary::new();
        page.set("Type", Object::Name(b"Page".to_vec()));
        page.set("MediaBox", Object::Array(media_box));
        page.set("Resources", Object::Dictionary(resources));
        page.set("Contents", Object::Reference(contents_id));
        doc.add_object(Object::Dictionary(page))
    };

    let pages_id = {
        let mut pages = lopdf::Dictionary::new();
        pages.set("Type", Object::Name(b"Pages".to_vec()));
        pages.set("Kids", Object::Array(vec![Object::Reference(page_id)]));
        pages.set("Count", Object::Integer(1));
        doc.add_object(Object::Dictionary(pages))
    };

    // The page had to exist before the Pages node (Kids needs its id), so the
    // back-reference is patched in afterwards. Lookup failures are ignored.
    if let Ok(page_dict) = doc
        .get_object_mut(page_id)
        .and_then(|obj| obj.as_dict_mut())
    {
        page_dict.set("Parent", Object::Reference(pages_id));
    }

    let catalog_id = {
        let mut catalog = lopdf::Dictionary::new();
        catalog.set("Type", Object::Name(b"Catalog".to_vec()));
        catalog.set("Pages", Object::Reference(pages_id));
        doc.add_object(Object::Dictionary(catalog))
    };
    doc.trailer.set("Root", Object::Reference(catalog_id));

    let runs = extract_text_runs_from_page(&doc, page_id).unwrap();
    assert_eq!(runs.len(), 4, "expected 4 fragments A/B/C/D");

    let tolerance = 1.0f32;

    // All four fragments share the single Tm, hence the same origin.
    for run in runs.iter() {
        let ox = run.tm_origin_x.unwrap_or(f32::NAN);
        assert!(
            (ox - 50.0).abs() < tolerance,
            "tm_origin_x should be 50, got {ox} for {:?}",
            run.text
        );
    }

    let first = &runs[0];
    let second = &runs[1];
    let third = &runs[2];
    let fourth = &runs[3];

    // Left column: A and C.
    assert!(
        (first.x - 50.0).abs() < tolerance,
        "A.x should be ≈50, got {}",
        first.x
    );
    assert!(
        (third.x - 50.0).abs() < tolerance,
        "C.x should be ≈50 after -200 Td, got {}",
        third.x
    );

    // Right column: B and D.
    let value_x = 200.0 * 10.0 + 50.0;
    assert!(
        (second.x - value_x).abs() < tolerance,
        "B.x should be ≈{value_x}, got {}",
        second.x
    );
    assert!(
        (fourth.x - value_x).abs() < tolerance,
        "D.x should be ≈{value_x}, got {}",
        fourth.x
    );

    // A missing tm_lm_x becomes NaN, which fails the comparisons below.
    let [a_lm, b_lm, c_lm, d_lm] =
        [first, second, third, fourth].map(|run| run.tm_lm_x.unwrap_or(f32::NAN));
    assert!(
        (a_lm - 50.0).abs() < tolerance,
        "A.tm_lm_x should be 50, got {a_lm}"
    );
    assert!(
        (b_lm - value_x).abs() < tolerance,
        "B.tm_lm_x should be {value_x}, got {b_lm}"
    );
    assert!(
        (c_lm - 50.0).abs() < tolerance,
        "C.tm_lm_x should be 50 (row reset), got {c_lm}"
    );
    assert!(
        (d_lm - value_x).abs() < tolerance,
        "D.tm_lm_x should be {value_x}, got {d_lm}"
    );
}
/// Runs `extract_table_cells` on the fragments of a one-row, two-column page
/// and checks the resulting cells.
///
/// The content stream sets `10 0 0 10 50 700 Tm`, shows `Label`, moves with
/// `200 0 Td`, then shows `Value`. Expectations:
/// * exactly two cells come back and each has at least one fragment;
/// * column 0 sits at x ≈ 50 and column 1 at x ≈ 50 + 200 * 10 (tolerance 5.0);
/// * the trimmed cell texts are `Label` and `Value`;
/// * `bbox()` of column 0 matches its `x`, `y`, `width`, `height` in order.
#[test]
fn extract_table_cells_has_fragments_and_tm_lm_cols() {
    use lopdf::{Document, Stream};

    let content = b"BT /F1 1 Tf 10 0 0 10 50 700 Tm \
        (Label) Tj 200 0 Td (Value) Tj ET"
        .to_vec();

    let mut doc = Document::new();

    // Objects are added in the order font, contents, page, pages, catalog.
    let font_id = {
        let mut font = lopdf::Dictionary::new();
        font.set("Type", Object::Name(b"Font".to_vec()));
        font.set("Subtype", Object::Name(b"Type1".to_vec()));
        font.set("BaseFont", Object::Name(b"Helvetica".to_vec()));
        doc.add_object(Object::Dictionary(font))
    };

    let contents_id = doc.add_object(Object::Stream(Stream::new(
        lopdf::Dictionary::new(),
        content,
    )));

    let page_id = {
        let mut fonts = lopdf::Dictionary::new();
        fonts.set("F1", Object::Reference(font_id));
        let mut resources = lopdf::Dictionary::new();
        resources.set("Font", Object::Dictionary(fonts));

        let media_box: Vec<Object> = [0, 0, 595, 842]
            .into_iter()
            .map(Object::Integer)
            .collect();

        let mut page = lopdf::Dictionary::new();
        page.set("Type", Object::Name(b"Page".to_vec()));
        page.set("MediaBox", Object::Array(media_box));
        page.set("Resources", Object::Dictionary(resources));
        page.set("Contents", Object::Reference(contents_id));
        doc.add_object(Object::Dictionary(page))
    };

    let pages_id = {
        let mut pages = lopdf::Dictionary::new();
        pages.set("Type", Object::Name(b"Pages".to_vec()));
        pages.set("Kids", Object::Array(vec![Object::Reference(page_id)]));
        pages.set("Count", Object::Integer(1));
        doc.add_object(Object::Dictionary(pages))
    };

    // The page had to exist before the Pages node (Kids needs its id), so the
    // back-reference is patched in afterwards. Lookup failures are ignored.
    if let Ok(page_dict) = doc
        .get_object_mut(page_id)
        .and_then(|obj| obj.as_dict_mut())
    {
        page_dict.set("Parent", Object::Reference(pages_id));
    }

    let catalog_id = {
        let mut catalog = lopdf::Dictionary::new();
        catalog.set("Type", Object::Name(b"Catalog".to_vec()));
        catalog.set("Pages", Object::Reference(pages_id));
        doc.add_object(Object::Dictionary(catalog))
    };
    doc.trailer.set("Root", Object::Reference(catalog_id));

    let runs = extract_text_runs_from_page(&doc, page_id).unwrap();
    let cells = extract_table_cells(&runs, 595.0, 842.0);
    assert_eq!(
        cells.len(),
        2,
        "expected 2 cells (label col + value col), got {}",
        cells.len()
    );

    for cell in cells.iter() {
        assert!(
            !cell.fragments.is_empty(),
            "cell ({},{}) has no fragments",
            cell.row,
            cell.col
        );
    }

    let col0 = cells
        .iter()
        .find(|cell| cell.col == 0)
        .expect("col 0 missing");
    let col1 = cells
        .iter()
        .find(|cell| cell.col == 1)
        .expect("col 1 missing");

    assert!(
        (col0.x - 50.0).abs() < 5.0,
        "col0.x should be ≈50, got {}",
        col0.x
    );
    let value_x = 50.0 + 200.0 * 10.0;
    assert!(
        (col1.x - value_x).abs() < 5.0,
        "col1.x should be ≈{value_x}, got {}",
        col1.x
    );

    assert_eq!(col0.text.trim(), "Label");
    assert_eq!(col1.text.trim(), "Value");

    // bbox() is compared element by element with x, y, width, height.
    let b = col0.bbox();
    assert!((b[0] - col0.x).abs() < 0.01);
    assert!((b[1] - col0.y).abs() < 0.01);
    assert!((b[2] - col0.width).abs() < 0.01);
    assert!((b[3] - col0.height).abs() < 0.01);
}