use crate::extractors::text::ArtifactType;
use crate::geometry::{Point, Rect};
use std::collections::HashMap;
/// A run of text that shares one bounding box and one set of font/style attributes.
///
/// Serializable with `serde`; when the `wasm` feature is enabled the field names are
/// emitted in camelCase. `TextSpan::to_chars` expands a span into per-character
/// `TextChar` values that inherit the span's font, style, colour and `mcid`.
#[derive(Debug, Clone, serde::Serialize)]
#[cfg_attr(feature = "wasm", serde(rename_all = "camelCase"))]
pub struct TextSpan {
/// Text content of the span.
pub text: String,
/// Bounding rectangle of the whole span. `to_chars` starts laying characters out at
/// `bbox.x` and reuses `bbox.y` / `bbox.height` for every character.
pub bbox: Rect,
/// Font name (`"Helvetica"` in `Default`).
pub font_name: String,
/// Font size (`12.0` in `Default`).
// NOTE(review): units are not stated in this file — presumably points; confirm.
pub font_size: f32,
/// Weight class of the font (`FontWeight::Normal` in `Default`).
pub font_weight: FontWeight,
/// Whether the span is italic.
pub is_italic: bool,
/// Whether the span uses a monospace font.
pub is_monospace: bool,
/// Text colour (black in `Default`).
pub color: Color,
/// Optional id copied onto every character by `to_chars`.
// NOTE(review): presumably the PDF marked-content id (MCID); confirm against the extractor.
pub mcid: Option<u32>,
/// Sequence number of the span (`0` in `Default`).
// NOTE(review): presumably the extraction-order index; confirm against the producer.
pub sequence: usize,
// NOTE(review): meaning not visible in this file — the name suggests the span begins at a
// point where a longer run was split; confirm against the code that sets it.
pub split_boundary_before: bool,
// NOTE(review): meaning not visible in this file; confirm against the code that sets it.
pub offset_semantic: bool,
/// Character spacing (`0.0` in `Default`).
// NOTE(review): presumably the PDF `Tc` text-state parameter; confirm.
pub char_spacing: f32,
/// Word spacing (`0.0` in `Default`).
// NOTE(review): presumably the PDF `Tw` text-state parameter; confirm.
pub word_spacing: f32,
/// Horizontal scaling (`100.0` in `Default`, which suggests a percentage — confirm).
pub horizontal_scaling: f32,
// NOTE(review): meaning not visible in this file; confirm against the code that sets it.
pub primary_detected: bool,
/// Optional artifact classification (`None` in `Default`).
pub artifact_type: Option<ArtifactType>,
/// Per-character widths. Omitted from serialized output when empty. `to_chars` uses
/// them only when their count equals the number of `char`s in `text`.
#[serde(skip_serializing_if = "Vec::is_empty")]
pub char_widths: Vec<f32>,
}
impl Default for TextSpan {
fn default() -> Self {
Self {
text: String::new(),
bbox: Rect::default(),
font_name: "Helvetica".to_string(),
font_size: 12.0,
font_weight: FontWeight::Normal,
is_italic: false,
is_monospace: false,
color: Color::black(),
mcid: None,
sequence: 0,
split_boundary_before: false,
offset_semantic: false,
char_spacing: 0.0,
word_spacing: 0.0,
horizontal_scaling: 100.0,
primary_detected: false,
artifact_type: None,
char_widths: Vec::new(),
}
}
}
impl TextSpan {
    /// Expands the span into one [`TextChar`] per `char` of `text`.
    ///
    /// Every character inherits the span's font, style, colour and `mcid`, shares the
    /// span's `bbox.y` / `bbox.height`, and is reported as unrotated.
    ///
    /// Horizontal layout:
    /// * If `char_widths` holds exactly one entry per character, those widths are used
    ///   and characters are placed left-to-right starting at `bbox.x`.
    /// * Otherwise (no widths, or a length mismatch) `bbox.width` is divided evenly
    ///   among the characters.
    ///
    /// Each character's `matrix` is a pure translation to its origin, so
    /// `TextChar::get_matrix` agrees with `origin_x` / `origin_y`.
    ///
    /// Returns an empty vector when `text` is empty.
    pub fn to_chars(&self) -> Vec<TextChar> {
        // Count Unicode scalar values, not bytes: widths are per `char`.
        let char_count = self.text.chars().count();
        if char_count == 0 {
            return Vec::new();
        }

        if self.char_widths.len() == char_count {
            // Running pen position; each glyph starts where the previous one ended.
            let mut cursor = self.bbox.x;
            self.text
                .chars()
                .zip(self.char_widths.iter().copied())
                .map(|(c, width)| {
                    let x = cursor;
                    cursor += width;
                    self.positioned_char(c, x, width)
                })
                .collect()
        } else {
            // No usable per-glyph metrics: fall back to equal-width cells.
            let width = self.bbox.width / (char_count as f32);
            self.text
                .chars()
                .enumerate()
                .map(|(i, c)| self.positioned_char(c, self.bbox.x + (i as f32) * width, width))
                .collect()
        }
    }

    /// Builds the `TextChar` for `c` placed at horizontal position `x` with the given
    /// advance `width`, copying all shared attributes from the span.
    fn positioned_char(&self, c: char, x: f32, width: f32) -> TextChar {
        let y = self.bbox.y;
        TextChar {
            char: c,
            bbox: Rect::new(x, y, width, self.bbox.height),
            font_name: self.font_name.clone(),
            font_size: self.font_size,
            font_weight: self.font_weight,
            is_italic: self.is_italic,
            is_monospace: self.is_monospace,
            color: self.color,
            mcid: self.mcid,
            origin_x: x,
            origin_y: y,
            rotation_degrees: 0.0,
            advance_width: width,
            // Identity rotation/scale translated to the glyph origin, keeping the
            // stored matrix consistent with `origin_x` / `origin_y`.
            matrix: Some([1.0, 0.0, 0.0, 1.0, x, y]),
        }
    }
}
/// A single positioned character together with its font, style and transform data.
///
/// Serializable with `serde`; when the `wasm` feature is enabled the field names are
/// emitted in camelCase.
#[derive(Debug, Clone, serde::Serialize)]
#[cfg_attr(feature = "wasm", serde(rename_all = "camelCase"))]
pub struct TextChar {
/// The character itself.
pub char: char,
/// Bounding rectangle of the character.
pub bbox: Rect,
/// Font name.
pub font_name: String,
/// Font size.
// NOTE(review): units are not stated in this file — presumably points; confirm.
pub font_size: f32,
/// Weight class of the font.
pub font_weight: FontWeight,
/// Whether the character is italic.
pub is_italic: bool,
/// Whether the character comes from a monospace font.
pub is_monospace: bool,
/// Text colour.
pub color: Color,
/// Optional id; `TextBlock::from_chars` keeps it only when every character shares it.
// NOTE(review): presumably the PDF marked-content id (MCID); confirm against the extractor.
pub mcid: Option<u32>,
/// X coordinate of the character origin. `with_matrix` sets it from `matrix[4]`.
pub origin_x: f32,
/// Y coordinate of the character origin. `with_matrix` sets it from `matrix[5]`.
pub origin_y: f32,
/// Rotation in degrees. `with_matrix` derives it as `atan2(matrix[1], matrix[0])`;
/// `is_rotated` treats magnitudes above 0.01 as rotated.
pub rotation_degrees: f32,
/// Advance width of the character.
pub advance_width: f32,
/// Optional 6-element transform. When present `get_matrix` returns it unchanged;
/// when `None`, `get_matrix` synthesizes a rotation-plus-translation matrix from
/// `rotation_degrees`, `origin_x` and `origin_y`.
pub matrix: Option<[f32; 6]>,
}
impl Default for TextChar {
fn default() -> Self {
Self {
char: ' ',
bbox: Rect::default(),
font_name: "Helvetica".to_string(),
font_size: 12.0,
font_weight: FontWeight::Normal,
is_italic: false,
is_monospace: false,
color: Color::black(),
mcid: None,
origin_x: 0.0,
origin_y: 0.0,
rotation_degrees: 0.0,
advance_width: 0.0,
matrix: Some([1.0, 0.0, 0.0, 1.0, 0.0, 0.0]),
}
}
}
impl TextChar {
pub fn rotation_radians(&self) -> f32 {
self.rotation_degrees.to_radians()
}
pub fn is_rotated(&self) -> bool {
self.rotation_degrees.abs() > 0.01
}
pub fn with_matrix(mut self, matrix: [f32; 6]) -> Self {
self.matrix = Some(matrix);
self.origin_x = matrix[4];
self.origin_y = matrix[5];
self.rotation_degrees = matrix[1].atan2(matrix[0]).to_degrees();
self
}
pub fn get_matrix(&self) -> [f32; 6] {
if let Some(m) = self.matrix {
m
} else {
let rad = self.rotation_radians();
let cos_r = rad.cos();
let sin_r = rad.sin();
[cos_r, sin_r, -sin_r, cos_r, self.origin_x, self.origin_y]
}
}
pub fn simple(char: char, bbox: Rect, font_name: String, font_size: f32) -> Self {
Self {
char,
bbox,
font_name,
font_size,
font_weight: FontWeight::Normal,
is_italic: false,
is_monospace: false,
color: Color::black(),
mcid: None,
origin_x: bbox.x,
origin_y: bbox.y,
rotation_degrees: 0.0,
advance_width: bbox.width,
matrix: None,
}
}
}
/// Discrete font weight classes.
///
/// Each variant's discriminant is its numeric weight (100 to 900 in steps of 100,
/// stored as `u16`), so the derived ordering runs from lightest to heaviest.
/// `Normal` is the default. `is_bold` treats weights of 600 and above as bold.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, serde::Serialize, Default)]
#[repr(u16)]
pub enum FontWeight {
/// Weight 100.
Thin = 100,
/// Weight 200.
ExtraLight = 200,
/// Weight 300.
Light = 300,
/// Weight 400 — the default.
#[default]
Normal = 400,
/// Weight 500.
Medium = 500,
/// Weight 600 — the lightest weight `is_bold` reports as bold.
SemiBold = 600,
/// Weight 700.
Bold = 700,
/// Weight 800.
ExtraBold = 800,
/// Weight 900.
Black = 900,
}
impl FontWeight {
    /// Returns `true` for `SemiBold`, `Bold`, `ExtraBold` and `Black`
    /// (numeric weight 600 or more).
    pub fn is_bold(&self) -> bool {
        matches!(
            self,
            FontWeight::SemiBold | FontWeight::Bold | FontWeight::ExtraBold | FontWeight::Black
        )
    }

    /// Maps an arbitrary numeric weight onto the nearest weight class.
    ///
    /// Values up to and including 150 (negative values too) become `Thin`; each
    /// following class covers the 100-wide band ending at `x50`; anything above 850
    /// becomes `Black`.
    pub fn from_pdf_value(value: i32) -> Self {
        // Inclusive upper bound of each band, lightest first. Values beyond the last
        // bound fall through to `Black`.
        const UPPER_BOUNDS: [(i32, FontWeight); 8] = [
            (150, FontWeight::Thin),
            (250, FontWeight::ExtraLight),
            (350, FontWeight::Light),
            (450, FontWeight::Normal),
            (550, FontWeight::Medium),
            (650, FontWeight::SemiBold),
            (750, FontWeight::Bold),
            (850, FontWeight::ExtraBold),
        ];

        UPPER_BOUNDS
            .iter()
            .find(|(upper, _)| value <= *upper)
            .map_or(FontWeight::Black, |&(_, weight)| weight)
    }

    /// Returns the numeric weight of this class (its `u16` discriminant).
    pub fn to_pdf_value(&self) -> u16 {
        let weight = *self;
        weight as u16
    }
}
/// An RGB colour with `f32` components.
///
/// `Color::black()` is (0.0, 0.0, 0.0) and `Color::white()` is (1.0, 1.0, 1.0), which
/// suggests components are normalized to 0.0..=1.0; that range is not enforced here.
/// The derived `Default` is all zeros, i.e. the same value as `Color::black()`.
#[derive(Debug, Clone, Copy, PartialEq, serde::Serialize, Default)]
pub struct Color {
/// Red component.
pub r: f32,
/// Green component.
pub g: f32,
/// Blue component.
pub b: f32,
}
impl Color {
pub fn new(r: f32, g: f32, b: f32) -> Self {
Self { r, g, b }
}
pub fn black() -> Self {
Self::new(0.0, 0.0, 0.0)
}
pub fn white() -> Self {
Self::new(1.0, 1.0, 1.0)
}
}
/// Text content of one page: its spans, its characters and the page dimensions.
///
/// Serializable with `serde`; when the `wasm` feature is enabled the field names are
/// emitted in camelCase.
// NOTE(review): this file does not show how `spans` and `chars` relate (e.g. whether
// `chars` is derived via `TextSpan::to_chars`); confirm against the code that builds it.
#[derive(Debug, Clone, serde::Serialize)]
#[cfg_attr(feature = "wasm", serde(rename_all = "camelCase"))]
pub struct PageText {
/// Text spans on the page.
pub spans: Vec<TextSpan>,
/// Individual characters on the page.
pub chars: Vec<TextChar>,
/// Page width.
// NOTE(review): units are not stated in this file — presumably the same as `Rect`; confirm.
pub page_width: f32,
/// Page height.
pub page_height: f32,
}
/// A group of characters with aggregate geometry and style information.
///
/// The field descriptions below state what `TextBlock::from_chars` computes; the
/// fields are public, so values constructed by other means may differ.
/// Serializable with `serde`; when the `wasm` feature is enabled the field names are
/// emitted in camelCase.
#[derive(Debug, Clone, serde::Serialize)]
#[cfg_attr(feature = "wasm", serde(rename_all = "camelCase"))]
pub struct TextBlock {
/// The characters making up the block, in the order they were supplied.
pub chars: Vec<TextChar>,
/// Union of all character bounding boxes.
pub bbox: Rect,
/// The characters concatenated in order.
pub text: String,
/// Arithmetic mean of the characters' font sizes.
pub avg_font_size: f32,
/// The font name used by the most characters.
pub dominant_font: String,
/// `true` if any character has a bold weight (`FontWeight::is_bold`).
pub is_bold: bool,
/// `true` if any character is italic.
pub is_italic: bool,
/// The `mcid` shared by every character, or `None` if the first character has none
/// or the characters disagree.
pub mcid: Option<u32>,
}
impl TextBlock {
    /// Builds a block from a non-empty list of characters, computing the aggregate
    /// fields:
    ///
    /// * `text` — the characters concatenated in order;
    /// * `bbox` — the union of all character boxes;
    /// * `avg_font_size` — the arithmetic mean of the font sizes;
    /// * `dominant_font` — the font name used by the most characters; ties are
    ///   broken deterministically in favour of the font that appears first;
    /// * `is_bold` / `is_italic` — `true` if any character is bold / italic;
    /// * `mcid` — the shared id when every character carries the same `Some` value,
    ///   otherwise `None`.
    ///
    /// # Panics
    ///
    /// Panics if `chars` is empty.
    pub fn from_chars(chars: Vec<TextChar>) -> Self {
        assert!(!chars.is_empty(), "Cannot create TextBlock from empty chars");

        let text: String = chars.iter().map(|c| c.char).collect();
        let bbox = chars
            .iter()
            .map(|c| c.bbox)
            .fold(chars[0].bbox, |acc, r| acc.union(&r));
        let avg_font_size = chars.iter().map(|c| c.font_size).sum::<f32>() / chars.len() as f32;

        // Tally each font as (occurrences, index of first occurrence). Keys borrow
        // from `chars`, so no per-character String allocation is needed.
        let mut font_stats: HashMap<&str, (usize, usize)> = HashMap::new();
        for (idx, c) in chars.iter().enumerate() {
            font_stats.entry(c.font_name.as_str()).or_insert((0, idx)).0 += 1;
        }
        // HashMap iteration order is arbitrary, so the key must be unique per font:
        // highest count wins, and among equal counts the earliest first occurrence.
        let dominant_font = font_stats
            .into_iter()
            .max_by_key(|&(_, (count, first_seen))| (count, std::cmp::Reverse(first_seen)))
            .map(|(font, _)| font.to_owned())
            .unwrap_or_default();

        let is_bold = chars.iter().any(|c| c.font_weight.is_bold());
        let is_italic = chars.iter().any(|c| c.is_italic);
        // Keep the first character's mcid only if every character agrees with it.
        let mcid = chars
            .first()
            .and_then(|c| c.mcid)
            .filter(|&first_mcid| chars.iter().all(|c| c.mcid == Some(first_mcid)));

        Self {
            chars,
            bbox,
            text,
            avg_font_size,
            dominant_font,
            is_bold,
            is_italic,
            mcid,
        }
    }

    /// Returns the centre point of the block's bounding box.
    pub fn center(&self) -> Point {
        self.bbox.center()
    }

    /// Returns `true` when the two blocks' `bbox.y` values differ by strictly less
    /// than `tolerance`.
    pub fn is_horizontally_aligned(&self, other: &TextBlock, tolerance: f32) -> bool {
        (self.bbox.y - other.bbox.y).abs() < tolerance
    }

    /// Returns `true` when the two blocks' `bbox.x` values differ by strictly less
    /// than `tolerance`.
    pub fn is_vertically_aligned(&self, other: &TextBlock, tolerance: f32) -> bool {
        (self.bbox.x - other.bbox.x).abs() < tolerance
    }
}
/// A word is represented by the same type as a [`TextBlock`]; `TextLine` stores its
/// contents as a list of `Word`s.
pub type Word = TextBlock;
/// A line of text made of words.
///
/// The field descriptions below state what `TextLine::new` computes; the fields are
/// public, so values constructed by other means may differ.
/// Serializable with `serde`; when the `wasm` feature is enabled the field names are
/// emitted in camelCase.
#[derive(Debug, Clone, serde::Serialize)]
#[cfg_attr(feature = "wasm", serde(rename_all = "camelCase"))]
pub struct TextLine {
/// The words of the line, in the order they were supplied.
pub words: Vec<Word>,
/// Union of all word bounding boxes.
pub bbox: Rect,
/// The words' text joined with a single space between consecutive words.
pub text: String,
}
impl TextLine {
pub fn new(words: Vec<Word>) -> Self {
assert!(!words.is_empty(), "Cannot create TextLine from empty words");
let bbox = words
.iter()
.map(|w| w.bbox)
.fold(words[0].bbox, |acc, r| acc.union(&r));
let text = words
.iter()
.map(|w| w.text.as_str())
.collect::<Vec<_>>()
.join(" ");
Self { words, bbox, text }
}
}
// Unit tests for the text layout types: block aggregation, monospace propagation,
// and the width-selection rules of `TextSpan::to_chars`.
#[cfg(test)]
mod tests {
use super::*;
// Builds a 10x12 normal-weight "Times" character at (x, y) with no stored matrix.
fn mock_char(c: char, x: f32, y: f32) -> TextChar {
let bbox = Rect::new(x, y, 10.0, 12.0);
TextChar {
char: c,
bbox,
font_name: "Times".to_string(),
font_size: 12.0,
font_weight: FontWeight::Normal,
is_italic: false,
is_monospace: false,
color: Color::black(),
mcid: None,
origin_x: bbox.x,
origin_y: bbox.y,
rotation_degrees: 0.0,
advance_width: bbox.width,
matrix: None,
}
}
// `from_chars` concatenates the characters and averages their font sizes.
#[test]
fn test_text_block_from_chars() {
let chars = vec![
mock_char('H', 0.0, 0.0),
mock_char('e', 10.0, 0.0),
mock_char('l', 20.0, 0.0),
mock_char('l', 30.0, 0.0),
mock_char('o', 40.0, 0.0),
];
let block = TextBlock::from_chars(chars);
assert_eq!(block.text, "Hello");
assert_eq!(block.avg_font_size, 12.0);
}
// A default span is not monospace.
#[test]
fn test_text_span_is_monospace_default() {
let span = TextSpan::default();
assert!(!span.is_monospace, "Default spans should not be monospace");
}
// The span's monospace flag is copied onto every character from `to_chars`.
#[test]
fn test_text_span_is_monospace_set() {
let span = TextSpan {
is_monospace: true,
text: "AB".to_string(),
bbox: Rect::new(0.0, 0.0, 20.0, 12.0),
..TextSpan::default()
};
assert!(span.is_monospace);
let chars = span.to_chars();
for c in &chars {
assert!(c.is_monospace, "TextChar should inherit is_monospace from span");
}
}
// The monospace flag can be set directly on a `TextChar`.
#[test]
fn test_text_char_is_monospace() {
let c = TextChar {
char: 'A',
bbox: Rect::new(0.0, 0.0, 10.0, 12.0),
font_name: "Courier".to_string(),
font_size: 12.0,
font_weight: FontWeight::Normal,
is_italic: false,
is_monospace: true,
color: Color::black(),
mcid: None,
origin_x: 0.0,
origin_y: 0.0,
rotation_degrees: 0.0,
advance_width: 10.0,
matrix: None,
};
assert!(c.is_monospace);
}
// With one width per character, `to_chars` uses those widths and advances x by each.
#[test]
fn test_to_chars_uses_char_widths_when_available() {
let span = TextSpan {
text: "AB".to_string(),
bbox: Rect::new(10.0, 20.0, 30.0, 12.0),
char_widths: vec![10.0, 20.0],
..TextSpan::default()
};
let chars = span.to_chars();
assert_eq!(chars.len(), 2);
assert!((chars[0].bbox.x - 10.0).abs() < 0.001);
assert!((chars[0].bbox.width - 10.0).abs() < 0.001);
assert!((chars[0].advance_width - 10.0).abs() < 0.001);
assert!((chars[1].bbox.x - 20.0).abs() < 0.001);
assert!((chars[1].bbox.width - 20.0).abs() < 0.001);
assert!((chars[1].advance_width - 20.0).abs() < 0.001);
}
// Without widths, the span's bbox width (30) is split evenly: 15 per character.
#[test]
fn test_to_chars_falls_back_to_uniform_when_no_widths() {
let span = TextSpan {
text: "AB".to_string(),
bbox: Rect::new(10.0, 20.0, 30.0, 12.0),
..TextSpan::default()
};
let chars = span.to_chars();
assert_eq!(chars.len(), 2);
assert!((chars[0].bbox.width - 15.0).abs() < 0.001);
assert!((chars[1].bbox.width - 15.0).abs() < 0.001);
assert!((chars[0].bbox.x - 10.0).abs() < 0.001);
assert!((chars[1].bbox.x - 25.0).abs() < 0.001);
}
// Two widths for three characters is a mismatch, so the uniform split (10 each) is used.
#[test]
fn test_to_chars_handles_mismatched_widths_gracefully() {
let span = TextSpan {
text: "ABC".to_string(),
bbox: Rect::new(0.0, 0.0, 30.0, 12.0),
char_widths: vec![5.0, 10.0], ..TextSpan::default()
};
let chars = span.to_chars();
assert_eq!(chars.len(), 3);
assert!((chars[0].bbox.width - 10.0).abs() < 0.001);
assert!((chars[1].bbox.width - 10.0).abs() < 0.001);
assert!((chars[2].bbox.width - 10.0).abs() < 0.001);
}
}