use lopdf::content::{Content, Operation};
use lopdf::{dictionary, Document, Object, Stream, StringFormat};
/// Round-trip check: builds a one-page PDF in memory whose only text is shown with two-byte
/// character codes, attaches a `ToUnicode` CMap translating those codes to emoji, and asserts
/// that extracting the first page's text yields the emoji.
#[test]
fn unicode_can_be_written_to_pdf_and_read() {
    let expected_text = "😀🔧🔨";
    // Big-endian two-byte codes 0x005F, 0x0060, 0x0061; the CMap's second `bfrange` entry
    // maps them to the UTF-16 surrogate pairs of the three emoji above.
    let cmap_codes: [u8; 6] = [0x00, 0x5F, 0x00, 0x60, 0x00, 0x61];

    // The literal's continuation lines are deliberately flush-left: any indentation would
    // become part of the stream bytes, which would then no longer match the declared
    // length of 437.
    let to_unicode_cmap = b"/CIDInit /ProcSet findresource begin
12 dict begin
begincmap
/CIDSystemInfo
<< /Registry (Adobe)
/Ordering (UCS)
/Supplement 0
>> def
/CMapName /Adobe-Identity-UCS def
/CMapType 2 def
1 begincodespacerange
<0000> <FFFF>
endcodespacerange
2 beginbfrange
<0000> <005E> <0020>
<005F> <0061> [<D83DDE00> <D83DDD27> <D83DDD28>]
endbfrange
1 beginbfchar
<3A51> <D840DC3E>
endbfchar
endcmap
CMapName currentdict /CMap defineresource pop
end
end";

    let mut pdf = Document::new();
    // Reserve the page tree's id up front so the page can reference it as its parent
    // before the page tree dictionary itself is inserted.
    let pages_id = pdf.new_object_id();

    let cmap_stream_id = pdf.add_object(Stream::new(
        dictionary! {
            "Length" => 437,
        },
        to_unicode_cmap.to_vec(),
    ));

    // Type0 font whose `ToUnicode` entry points at the CMap stream above.
    let font_id = pdf.add_object(dictionary! {
        "Type" => "Font",
        "Subtype" => "Type0",
        "BaseFont" => "Ryumin-Light",
        "Encoding" => "Identity-H",
        "ToUnicode" => Object::Reference(cmap_stream_id),
    });
    let resources_id = pdf.add_object(dictionary! {
        "Font" => dictionary! {
            "F1" => font_id,
        },
    });

    // Content stream: select font F1 at size 48, move to (100, 600), show the coded string.
    let text_operations = vec![
        Operation::new("BT", vec![]),
        Operation::new("Tf", vec!["F1".into(), 48.into()]),
        Operation::new("Td", vec![100.into(), 600.into()]),
        Operation::new(
            "Tj",
            vec![Object::String(cmap_codes.to_vec(), StringFormat::Hexadecimal)],
        ),
        Operation::new("ET", vec![]),
    ];
    let content_bytes = Content {
        operations: text_operations,
    }
    .encode()
    .unwrap();
    let content_id = pdf.add_object(Stream::new(dictionary! {}, content_bytes));

    let page_id = pdf.add_object(dictionary! {
        "Type" => "Page",
        "Parent" => pages_id,
        "Contents" => content_id,
    });

    // Resources and MediaBox are set on the page tree node rather than on the page itself.
    pdf.objects.insert(
        pages_id,
        Object::Dictionary(dictionary! {
            "Type" => "Pages",
            "Kids" => vec![page_id.into()],
            "Count" => 1,
            "Resources" => resources_id,
            "MediaBox" => vec![0.into(), 0.into(), 595.into(), 842.into()],
        }),
    );

    let catalog_id = pdf.add_object(dictionary! {
        "Type" => "Catalog",
        "Pages" => pages_id,
    });
    pdf.trailer.set("Root", catalog_id);
    pdf.compress();

    // Trailing whitespace in the extracted text is ignored by the comparison.
    let extracted = get_text_from_first_page(&pdf);
    assert_eq!(extracted.trim_end(), expected_text);
}
/// Extracts the text of the first page of `doc`.
///
/// The first page is the entry with the smallest key in the map returned by
/// `Document::get_pages`; that key is what gets passed to `Document::extract_text`.
///
/// # Panics
///
/// Panics if `doc` has no pages, or if text extraction for the first page fails.
fn get_text_from_first_page(doc: &Document) -> String {
    let pages = doc.get_pages();
    // A read-only lookup is enough here: only the key is needed, so no mutable binding or
    // entry handle is required.
    let (&first_page_number, _) = pages
        .first_key_value()
        .expect("Expected pages to be non empty");
    doc.extract_text(&[first_page_number])
        .expect("Expected to find text on the first page")
}
/// Synchronous variant, compiled only when the `async` feature is disabled: loads the
/// `assets/unicode.pdf` fixture and checks that the first page's text, with every newline
/// removed, is exactly the three expected emoji.
#[cfg(not(feature = "async"))]
#[test]
fn unicode_can_be_extracted_from_loaded_pdf() -> lopdf::Result<()> {
    let loaded = Document::load("assets/unicode.pdf")?;
    let text_without_newlines = get_text_from_first_page(&loaded).replace("\n", "");
    assert_eq!(text_without_newlines, "😀🔧🔨");
    Ok(())
}
/// Asynchronous variant, compiled only when the `async` feature is enabled: awaits the load
/// of the `assets/unicode.pdf` fixture and checks that the first page's text, with every
/// newline removed, is exactly the three expected emoji.
#[cfg(feature = "async")]
#[tokio::test]
async fn unicode_can_be_extracted_from_loaded_pdf() -> lopdf::Result<()> {
    let loaded = Document::load("assets/unicode.pdf").await?;
    let text_without_newlines = get_text_from_first_page(&loaded).replace("\n", "");
    assert_eq!(text_without_newlines, "😀🔧🔨");
    Ok(())
}