use lopdf::{Object, StringFormat};
use serde_derive::{Deserialize, Serialize};
/// A single glyph entry, as stored inside a [`TextItem::GlyphIds`] run.
#[derive(Debug, Clone, PartialEq, PartialOrd, Deserialize, Serialize)]
pub struct Codepoint {
/// 16-bit glyph identifier.
pub gid: u16,
/// Numeric adjustment associated with this glyph; `0.0` means "none".
/// NOTE(review): presumably in the same units as a `TJ` array number
/// (thousandths of a text-space unit) — confirm against callers.
pub offset: f32,
/// Optional string identifier attached to the glyph; `None` unless the
/// value was built with [`Codepoint::with_cid`].
/// NOTE(review): the exact meaning of this string is not visible here.
pub cid: Option<String>,
}
impl Codepoint {
pub fn new(gid: u16, offset: f32) -> Self {
Self { gid, offset, cid: None }
}
pub fn with_cid(gid: u16, offset: f32, cid: String) -> Self {
Self { gid, offset, cid: Some(cid) }
}
}
/// One element of a text-showing operand list: a piece of text, a numeric
/// offset, or a run of glyph ids.
///
/// Serialized with `#[serde(untagged)]`, i.e. without a variant tag.
#[derive(Debug, Clone, PartialEq, PartialOrd, Deserialize, Serialize)]
#[serde(untagged)]
pub enum TextItem {
/// Decoded text.
Text(String),
/// A numeric operand found between strings.
Offset(f32),
/// A run of glyphs addressed by glyph id rather than by text.
GlyphIds(Vec<Codepoint>),
}
impl From<String> for TextItem {
fn from(s: String) -> Self {
TextItem::Text(s)
}
}
impl From<&str> for TextItem {
fn from(s: &str) -> Self {
TextItem::Text(s.to_string())
}
}
impl From<f32> for TextItem {
fn from(n: f32) -> Self {
TextItem::Offset(n)
}
}
impl From<f64> for TextItem {
fn from(n: f64) -> Self {
TextItem::Offset(n as f32)
}
}
impl From<i32> for TextItem {
fn from(n: i32) -> Self {
TextItem::Offset(n as f32)
}
}
impl From<i64> for TextItem {
fn from(n: i64) -> Self {
TextItem::Offset(n as f32)
}
}
impl From<u8> for TextItem {
fn from(n: u8) -> Self {
TextItem::Offset(n as f32)
}
}
/// Translates the raw bytes of a PDF string into text.
///
/// `decode_pdf_string` consults an implementation of this trait, when one is
/// supplied, instead of falling back to lossy UTF-8 decoding.
pub trait CMap {
/// Returns the text that `bytes` map to.
fn map_bytes(&self, bytes: &[u8]) -> String;
}
/// Decodes a PDF string object into a Rust `String`.
///
/// * Anything that is not `Object::String` decodes to an empty string.
/// * A hexadecimal string whose bytes start with the `FE FF` marker is read
///   as big-endian UTF-16. The marker itself is skipped and a dangling odd
///   trailing byte is ignored.
/// * Every other string is handed to `to_unicode` when one is supplied, and
///   otherwise decoded as lossy UTF-8.
///
/// Malformed UTF-16 (for example an unpaired surrogate) is replaced with
/// U+FFFD instead of discarding the whole string, which matches the lossy
/// behaviour of the UTF-8 fallback.
pub fn decode_pdf_string(obj: &Object, to_unicode: Option<&impl CMap>) -> String {
    let (bytes, format) = match obj {
        Object::String(bytes, format) => (bytes, format),
        _ => return String::new(),
    };

    // Only hexadecimal strings are probed for the UTF-16BE marker; literal
    // strings always go through the cmap / UTF-8 path.
    let is_utf16_be = match format {
        StringFormat::Hexadecimal => bytes.starts_with(&[0xFE_u8, 0xFF]),
        StringFormat::Literal => false,
    };

    if is_utf16_be {
        let units: Vec<u16> = bytes[2..]
            .chunks_exact(2)
            .map(|pair| u16::from_be_bytes([pair[0], pair[1]]))
            .collect();
        return String::from_utf16_lossy(&units);
    }

    match to_unicode {
        Some(cmap) => cmap.map_bytes(bytes),
        None => String::from_utf8_lossy(bytes).into_owned(),
    }
}
/// Converts a list of operand objects into [`TextItem`]s.
///
/// String operands are decoded with `decode_pdf_string` (using `to_unicode`
/// when given) and become [`TextItem::Text`]; integer and real operands
/// become [`TextItem::Offset`]. Operands of any other kind are skipped.
pub fn decode_tj_operands(operands: &[Object], to_unicode: Option<&impl CMap>) -> Vec<TextItem> {
    operands
        .iter()
        .filter_map(|operand| match operand {
            Object::String(_, _) => Some(TextItem::Text(decode_pdf_string(operand, to_unicode))),
            Object::Integer(value) => Some(TextItem::Offset(*value as f32)),
            Object::Real(value) => Some(TextItem::Offset(*value as f32)),
            _ => None,
        })
        .collect()
}
pub fn decode_tj_operands_as_glyph_ids(operands: &[Object]) -> Vec<TextItem> {
let mut items = Vec::new();
let mut current_glyphs = Vec::new();
for obj in operands {
match obj {
Object::String(bytes, _) => {
if bytes.len() >= 2 && bytes.len() % 2 == 0 {
for chunk in bytes.chunks(2) {
if chunk.len() == 2 {
let gid = u16::from_be_bytes([chunk[0], chunk[1]]);
current_glyphs.push(Codepoint::new(gid, 0.0));
}
}
} else {
for &byte in bytes {
current_glyphs.push(Codepoint::new(byte as u16, 0.0));
}
}
}
Object::Integer(i) => {
if !current_glyphs.is_empty() {
items.push(TextItem::GlyphIds(std::mem::take(&mut current_glyphs)));
}
items.push(TextItem::Offset(*i as f32));
}
Object::Real(r) => {
if !current_glyphs.is_empty() {
items.push(TextItem::GlyphIds(std::mem::take(&mut current_glyphs)));
}
items.push(TextItem::Offset(*r as f32));
}
_ => {
}
}
}
if !current_glyphs.is_empty() {
items.push(TextItem::GlyphIds(current_glyphs));
}
items
}
pub fn decode_tj_string_as_glyph_ids(bytes: &[u8]) -> Vec<TextItem> {
let mut glyphs = Vec::new();
if bytes.len() >= 2 && bytes.len() % 2 == 0 {
for chunk in bytes.chunks(2) {
if chunk.len() == 2 {
let gid = u16::from_be_bytes([chunk[0], chunk[1]]);
glyphs.push(Codepoint::new(gid, 0.0));
}
}
} else {
for &byte in bytes {
glyphs.push(Codepoint::new(byte as u16, 0.0));
}
}
if glyphs.is_empty() {
vec![]
} else {
vec![TextItem::GlyphIds(glyphs)]
}
}
/// Formats `s` as a parenthesised PDF literal string.
///
/// Parentheses and backslashes are backslash-escaped; line feed, carriage
/// return, tab, backspace and form feed are written as `\n`, `\r`, `\t`,
/// `\b` and `\f`. All other characters are copied through unchanged.
pub fn encode_pdf_string_literal(s: &str) -> String {
    let mut out = String::with_capacity(s.len() + 2);
    out.push('(');
    for ch in s.chars() {
        let escape = match ch {
            '(' => Some("\\("),
            ')' => Some("\\)"),
            '\\' => Some("\\\\"),
            '\n' => Some("\\n"),
            '\r' => Some("\\r"),
            '\t' => Some("\\t"),
            '\x08' => Some("\\b"),
            '\x0C' => Some("\\f"),
            _ => None,
        };
        match escape {
            Some(sequence) => out.push_str(sequence),
            None => out.push(ch),
        }
    }
    out.push(')');
    out
}
/// Formats `s` as a PDF hexadecimal string: `<`, the uppercase hex digits of
/// the UTF-16BE encoding of `s` preceded by the `FEFF` marker, then `>`.
pub fn encode_pdf_string_hex(s: &str) -> String {
    const HEX_DIGITS: &[u8; 16] = b"0123456789ABCDEF";

    let mut out = String::with_capacity(2 + 4 * (1 + s.len()));
    out.push('<');
    for unit in std::iter::once(0xFEFF_u16).chain(s.encode_utf16()) {
        let pair = unit.to_be_bytes();
        for &byte in pair.iter() {
            out.push(HEX_DIGITS[usize::from(byte >> 4)] as char);
            out.push(HEX_DIGITS[usize::from(byte & 0x0F)] as char);
        }
    }
    out.push('>');
    out
}
/// Returns whichever of the literal and hexadecimal encodings of `s` is
/// shorter (in bytes), preferring the literal form on a tie.
///
/// NOTE(review): the hex form spends four digits per UTF-16 unit plus the
/// marker, so with the encoders in this file the literal form looks like it
/// always wins — confirm whether the hex branch is reachable.
pub fn encode_pdf_string_minimal(s: &str) -> String {
    let literal_form = encode_pdf_string_literal(s);
    let hex_form = encode_pdf_string_hex(s);
    if hex_form.len() < literal_form.len() {
        hex_form
    } else {
        literal_form
    }
}
/// Converts [`TextItem`]s into operand objects.
///
/// * [`TextItem::Text`] becomes one string object. When
///   `encode_pdf_string_minimal` picks the hexadecimal form, the bytes are
///   the UTF-16BE encoding of the text preceded by the `FE FF` marker;
///   otherwise the raw UTF-8 bytes are stored as a literal string.
/// * [`TextItem::Offset`] becomes one numeric object.
/// * [`TextItem::GlyphIds`] becomes one two-byte hexadecimal string per
///   glyph, each followed by a numeric object when that glyph's `offset` is
///   non-zero.
///
/// Numbers with a fractional part are written as `Object::Real` so they are
/// not truncated; whole numbers are written as `Object::Integer`.
pub fn encode_text_items(items: &[TextItem]) -> Vec<Object> {
    // Chooses the numeric object for an offset. Only finite values with a
    // fractional part need a real; everything else keeps the integer cast.
    fn offset_object(value: f32) -> Object {
        if value.is_finite() && value.fract() != 0.0 {
            // `.into()` works whether `Object::Real` carries an `f32` or `f64`.
            Object::Real(value.into())
        } else {
            Object::Integer(value as i64)
        }
    }

    let mut objs = Vec::new();
    for item in items {
        match item {
            TextItem::Text(s) => {
                if encode_pdf_string_minimal(s).starts_with('<') {
                    // Same bytes the hex form spells out: marker + UTF-16BE.
                    let mut bytes = Vec::with_capacity(2 + 2 * s.len());
                    bytes.extend_from_slice(&0xFEFF_u16.to_be_bytes());
                    for unit in s.encode_utf16() {
                        bytes.extend_from_slice(&unit.to_be_bytes());
                    }
                    objs.push(Object::String(bytes, StringFormat::Hexadecimal));
                } else {
                    objs.push(Object::String(s.as_bytes().to_vec(), StringFormat::Literal));
                }
            }
            TextItem::Offset(n) => objs.push(offset_object(*n)),
            TextItem::GlyphIds(glyphs) => {
                for codepoint in glyphs {
                    let bytes = codepoint.gid.to_be_bytes().to_vec();
                    objs.push(Object::String(bytes, StringFormat::Hexadecimal));
                    if codepoint.offset != 0.0 {
                        objs.push(offset_object(codepoint.offset));
                    }
                }
            }
        }
    }
    objs
}