use crate::config::PageNumber;
use regex::Regex;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::sync::LazyLock;
/// Classification attached to a layout [`Block`].
///
/// `Body` is the default. The section builders in this file only distinguish
/// `Caption` from everything else: any non-caption block is stored as section
/// content.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Default)]
pub enum BlockType {
/// Default classification, assigned by `Block::new`.
#[default]
Body,
/// Caption block; collected into `Section::captions` instead of `Section::contents`.
Caption,
/// Header block. Not given special treatment anywhere in this file.
Header,
}
/// A piece of text paired with an optional math-marked variant.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct RichText {
/// The text as originally extracted.
pub original: String,
/// Variant of the text with math markup; left out of the serialized output when `None`.
// NOTE(review): the markup format is not visible in this file — confirm with the producer.
#[serde(skip_serializing_if = "Option::is_none")]
pub math_marked: Option<String>,
}
/// Second halves of hyphenated compounds ("data-based", "event-driven", ...).
///
/// Each entry is turned into one regex in `SUFFIX_REGEXES`; `fix_suffix_hyphens`
/// uses those to re-insert a hyphen in front of the suffix when a word ends
/// with it. Entries are interpolated into the pattern verbatim, so they must
/// not contain regex metacharacters.
const HYPHENATED_SUFFIXES: &[&str] = &[
"based",
"driven",
"oriented",
"aware",
"agnostic",
"independent",
"dependent",
"first",
"native",
"centric",
"intensive",
"bound",
"safe",
"free",
"proof",
"efficient",
"optimized",
"enabled",
"powered",
"ready",
"capable",
"compatible",
"compliant",
"level",
"scale",
"wide",
"specific",
"friendly",
"facing",
"like",
"style",
];
/// One compiled pattern per entry of `HYPHENATED_SUFFIXES`, built on first use.
///
/// Each pattern matches a whole run of ASCII letters that ends with the suffix
/// and has at least one letter in front of it. Entries are ordered from the
/// longest suffix to the shortest (ties keep their declaration order), so
/// "independent" is tried before "dependent".
static SUFFIX_REGEXES: LazyLock<Vec<(Regex, &'static str)>> = LazyLock::new(|| {
    let mut ordered = HYPHENATED_SUFFIXES.to_vec();
    // Stable sort: suffixes of equal length stay in declaration order.
    ordered.sort_by_key(|suffix| std::cmp::Reverse(suffix.len()));

    let mut compiled = Vec::with_capacity(ordered.len());
    for suffix in ordered {
        let source = format!(r"\b[A-Za-z]+{}\b", suffix);
        compiled.push((Regex::new(&source).unwrap(), suffix));
    }
    compiled
});
pub fn fix_suffix_hyphens(text: &str) -> String {
let mut result = text.to_string();
for (regex, suffix) in SUFFIX_REGEXES.iter() {
let current = result.clone();
result = regex
.replace_all(¤t, |caps: ®ex::Captures| {
let m = caps.get(0).unwrap();
let matched = m.as_str();
let start_pos = m.start();
if start_pos > 0 {
let prev_in_text = current.as_bytes()[start_pos - 1] as char;
if prev_in_text == '-' {
return matched.to_string();
}
}
let suffix_pos = matched.len() - suffix.len();
if suffix_pos > 0 {
let prev_char = matched.as_bytes()[suffix_pos - 1] as char;
if prev_char != '-' && prev_char != ' ' {
let (head, _) = matched.split_at(suffix_pos);
return format!("{}-{}", head, suffix);
}
}
matched.to_string()
})
.to_string();
}
result
}
/// A single word together with its bounding box.
// NOTE(review): coordinate units and origin are not established in this file — confirm with the extractor.
#[derive(Debug, Clone, PartialEq)]
pub struct Word {
/// The word's text; `Line::add_word` stores it trimmed.
pub text: String,
/// Horizontal position of the box.
pub x: f32,
/// Vertical position of the box.
pub y: f32,
/// Box width.
pub width: f32,
/// Box height; `Word::font_size` returns this value.
pub height: f32,
}
impl Word {
    /// Font size of the word, taken to be the height of its bounding box.
    pub fn font_size(&self) -> f32 {
        self.height
    }
}
/// A line of text: an ordered list of words plus the line's bounding box.
#[derive(Debug, Clone, PartialEq)]
pub struct Line {
/// Words in the order they were added with `Line::add_word`.
pub words: Vec<Word>,
/// Horizontal position of the line's box.
pub x: f32,
/// Vertical position of the line's box.
pub y: f32,
/// Box width.
pub width: f32,
/// Box height.
pub height: f32,
}
impl Line {
    /// Creates an empty line with the given bounding box.
    pub fn new(x: f32, y: f32, width: f32, height: f32) -> Line {
        Line {
            words: Vec::new(),
            x,
            y,
            width,
            height,
        }
    }

    /// Appends a word to the line.
    ///
    /// Leading and trailing whitespace is stripped from `text` before it is
    /// stored; the box values are stored as given.
    pub fn add_word(&mut self, text: String, x: f32, y: f32, width: f32, height: f32) {
        let text = text.trim().to_string();
        self.words.push(Word {
            text,
            x,
            y,
            width,
            height,
        });
    }

    /// Returns the words of the line joined by single spaces
    /// (an empty string when the line has no words).
    pub fn get_text(&self) -> String {
        self.words
            .iter()
            .map(|word| word.text.as_str())
            .collect::<Vec<_>>()
            .join(" ")
    }
}
/// A block of text lines with a bounding box, a section label and a type.
#[derive(Debug, Clone, PartialEq)]
pub struct Block {
/// Lines in the order they were added with `Block::add_line`.
pub lines: Vec<Line>,
/// Horizontal position of the block's box.
pub x: f32,
/// Vertical position of the block's box.
pub y: f32,
/// Box width.
pub width: f32,
/// Box height.
pub height: f32,
/// Section key of the block; `Section::from_pages` groups blocks by this
/// value and uses it as the section title. Empty after `Block::new`.
pub section: String,
/// Classification of the block; `BlockType::Body` after `Block::new`.
pub block_type: BlockType,
}
impl Block {
    /// Creates an empty block with the given bounding box, an empty section
    /// label and the default block type.
    pub fn new(x: f32, y: f32, width: f32, height: f32) -> Block {
        Block {
            lines: Vec::new(),
            x,
            y,
            width,
            height,
            section: String::new(),
            block_type: BlockType::default(),
        }
    }

    /// Appends an empty line with the given bounding box.
    pub fn add_line(&mut self, x: f32, y: f32, width: f32, height: f32) {
        self.lines.push(Line::new(x, y, width, height));
    }

    /// Returns the block's text as a single trimmed string.
    ///
    /// Lines are joined with one space. When the text gathered so far ends in
    /// `-` (ignoring trailing whitespace), all trailing hyphens are dropped
    /// and the next line is appended without a space, undoing a line-break
    /// hyphenation. The joined text is then passed through
    /// `fix_suffix_hyphens` to restore hyphens of known compounds.
    pub fn get_text(&self) -> String {
        let mut joined = String::new();
        for line in &self.lines {
            // Shrink in place instead of re-allocating the accumulated text on
            // every line. Leading whitespace is irrelevant here: the end of the
            // string decides the branch, and the final result is trimmed.
            let kept = joined.trim_end().len();
            joined.truncate(kept);
            if joined.ends_with('-') {
                let kept = joined.trim_end_matches('-').len();
                joined.truncate(kept);
            } else {
                joined.push(' ');
            }
            joined.push_str(&line.get_text());
        }
        fix_suffix_hyphens(&joined).trim().to_string()
    }
}
/// One page: its blocks, its size and page-level metadata.
#[derive(Debug, Clone, PartialEq)]
pub struct Page {
/// Blocks in the order they were added with `Page::add_block`.
pub blocks: Vec<Block>,
/// Page width.
pub width: f32,
/// Page height.
pub height: f32,
/// Rectangles associated with tables; empty after `Page::new`.
// NOTE(review): presumably the bounding boxes of detected tables — confirm with the code that fills it.
pub tables: Vec<Coordinate>,
/// Page number as supplied to `Page::new`.
pub page_number: PageNumber,
/// Number of text columns; `Page::new` initialises it to 1.
pub number_of_columns: i8,
}
impl Page {
pub fn new(width: f32, height: f32, page_number: PageNumber) -> Page {
Page {
blocks: Vec::new(),
width: width,
height: height,
tables: Vec::new(),
page_number,
number_of_columns: 1,
}
}
pub fn add_block(&mut self, x: f32, y: f32, width: f32, height: f32) {
self.blocks.push(Block::new(x, y, width, height));
}
pub fn get_text(&self) -> String {
let mut text = String::new();
for block in &self.blocks {
text.push_str(&block.get_text());
text.push_str("\n\n");
}
return text;
}
pub fn top(&self) -> Option<f32> {
let mut values: Vec<f32> = Vec::new();
for block in &self.blocks {
for line in &block.lines {
values.push(line.y);
}
}
if values.is_empty() {
return None;
}
values.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
values.first().copied()
}
pub fn bottom(&self) -> Option<f32> {
let mut values: Vec<f32> = Vec::new();
for block in &self.blocks {
for line in &block.lines {
values.push(line.y + line.height);
}
}
if values.is_empty() {
return None;
}
values.sort_by(|a, b| b.partial_cmp(a).unwrap_or(std::cmp::Ordering::Equal));
values.first().copied()
}
pub fn left(&self) -> Option<f32> {
let mut values: Vec<f32> = Vec::new();
for block in &self.blocks {
for line in &block.lines {
values.push(line.x);
}
}
if values.is_empty() {
return None;
}
values.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
values.first().copied()
}
pub fn right(&self) -> Option<f32> {
let mut values: Vec<f32> = Vec::new();
for block in &self.blocks {
for line in &block.lines {
values.push(line.x + line.width);
}
}
if values.is_empty() {
return None;
}
values.sort_by(|a, b| b.partial_cmp(a).unwrap_or(std::cmp::Ordering::Equal));
values.first().copied()
}
}
/// A 2-D point.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct Point {
/// Horizontal coordinate.
pub x: f32,
/// Vertical coordinate.
pub y: f32,
}
impl Point {
    /// Creates a point from its two coordinates.
    pub fn new(x: f32, y: f32) -> Point {
        Point { x, y }
    }
}
/// A rectangle stored as its four corner points.
///
/// The constructors in this file (`from_rect`, `from_object`) always build
/// axis-aligned rectangles; the geometry helpers read only `top_left`,
/// `top_right`, `bottom_left` and `bottom_right` as such corners.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct Coordinate {
/// Corner with the smaller `x` and smaller `y`.
pub top_left: Point,
/// Corner with the larger `x` and smaller `y`.
pub top_right: Point,
/// Corner with the smaller `x` and larger `y`.
pub bottom_left: Point,
/// Corner with the larger `x` and larger `y`.
pub bottom_right: Point,
}
impl Coordinate {
    /// Builds the axis-aligned rectangle with corners `(x1, y1)` and `(x2, y2)`.
    ///
    /// The values are stored as given; nothing is reordered when `x2 < x1`
    /// or `y2 < y1`.
    pub fn from_rect(x1: f32, y1: f32, x2: f32, y2: f32) -> Coordinate {
        Coordinate {
            top_left: Point { x: x1, y: y1 },
            top_right: Point { x: x2, y: y1 },
            bottom_left: Point { x: x1, y: y2 },
            bottom_right: Point { x: x2, y: y2 },
        }
    }

    /// Builds the rectangle whose top-left corner is `(x, y)` with the given size.
    pub fn from_object(x: f32, y: f32, width: f32, height: f32) -> Coordinate {
        Coordinate::from_rect(x, y, x + width, y + height)
    }

    /// Horizontal extent: `top_right.x - top_left.x`.
    pub fn width(&self) -> f32 {
        self.top_right.x - self.top_left.x
    }

    /// Vertical extent: `bottom_left.y - top_left.y`.
    pub fn height(&self) -> f32 {
        self.bottom_left.y - self.top_left.y
    }

    /// Returns `true` when the two rectangles overlap with non-zero area.
    /// Rectangles that merely share an edge or a corner do not count.
    pub fn is_intercept(&self, other: &Coordinate) -> bool {
        let apart_in_x =
            self.top_left.x >= other.bottom_right.x || self.bottom_right.x <= other.top_left.x;
        if apart_in_x {
            return false;
        }
        let apart_in_y =
            self.top_left.y >= other.bottom_right.y || self.bottom_right.y <= other.top_left.y;
        !apart_in_y
    }

    /// Area of the rectangle (`width() * height()`).
    pub fn get_area(&self) -> f32 {
        self.width() * self.height()
    }

    /// Returns the overlap of the two rectangles.
    ///
    /// When the rectangles do not overlap along an axis, the result is
    /// collapsed to zero extent on that axis, so a disjoint pair always yields
    /// a rectangle of area 0. (Without the clamp, a pair that is disjoint on
    /// both axes would produce negative width and height, whose product is a
    /// positive "area".)
    pub fn intersection(&self, other: &Coordinate) -> Coordinate {
        let x1 = f32::max(self.top_left.x, other.top_left.x);
        let y1 = f32::max(self.top_left.y, other.top_left.y);
        let x2 = f32::min(self.bottom_right.x, other.bottom_right.x).max(x1);
        let y2 = f32::min(self.bottom_right.y, other.bottom_right.y).max(y1);
        Coordinate::from_rect(x1, y1, x2, y2)
    }

    /// Intersection-over-union of the two rectangles; `0.0` when they do not overlap.
    pub fn iou(&self, other: &Coordinate) -> f32 {
        let overlap_w = f32::min(self.bottom_right.x, other.bottom_right.x)
            - f32::max(self.top_left.x, other.top_left.x);
        let overlap_h = f32::min(self.bottom_right.y, other.bottom_right.y)
            - f32::max(self.top_left.y, other.top_left.y);
        if overlap_w <= 0.0 || overlap_h <= 0.0 {
            return 0.0;
        }
        let overlap = overlap_w * overlap_h;
        let union = self.get_area() + other.get_area() - overlap;
        overlap / union
    }

    /// Returns `true` when the rectangles overlap and more than 30 % of
    /// `self`'s area lies inside `other`.
    pub fn is_contained_in(&self, other: &Coordinate) -> bool {
        // The overlap check comes first so the ratio below is only evaluated
        // for rectangles that really intersect.
        let overlaps = self.iou(other) > 0.0;
        overlaps && self.intersection(other).get_area() / self.get_area() > 0.3
    }
}
/// Text of a block together with the rectangle it occupies.
///
/// Built from a [`Block`] by `TextBlock::from_block`.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct TextBlock {
/// The block's text.
pub text: String,
/// Bounding rectangle of the block.
pub coordinates: Coordinate,
}
/// Text plus bounding rectangle, with the same fields as [`TextBlock`].
///
/// Unlike `TextBlock`, this type derives no traits (not even `Debug` or `Clone`).
// NOTE(review): nothing in this file constructs or reads this type — confirm its intended use.
pub struct TextBlockReference {
/// The text.
pub text: String,
/// Bounding rectangle of the text.
pub coordinates: Coordinate,
}
/// One bibliography entry.
///
/// Every field is optional and is left out of the serialized output when it
/// is `None`.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct Reference {
/// The unparsed reference string.
#[serde(skip_serializing_if = "Option::is_none")]
pub raw_text: Option<String>,
/// Author names.
#[serde(skip_serializing_if = "Option::is_none")]
pub authors: Option<Vec<String>>,
/// Title of the referenced work.
#[serde(skip_serializing_if = "Option::is_none")]
pub title: Option<String>,
/// Publication year.
#[serde(skip_serializing_if = "Option::is_none")]
pub year: Option<i32>,
/// Publication venue.
#[serde(skip_serializing_if = "Option::is_none")]
pub venue: Option<String>,
/// DOI of the work.
#[serde(skip_serializing_if = "Option::is_none")]
pub doi: Option<String>,
/// URL of the work.
#[serde(skip_serializing_if = "Option::is_none")]
pub url: Option<String>,
/// arXiv identifier of the work.
#[serde(skip_serializing_if = "Option::is_none")]
pub arxiv_id: Option<String>,
/// Volume, kept as text.
#[serde(skip_serializing_if = "Option::is_none")]
pub volume: Option<String>,
/// Page range, kept as text.
#[serde(skip_serializing_if = "Option::is_none")]
pub pages: Option<String>,
}
/// Serializable result for a whole paper: its sections and its references.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct PaperOutput {
/// Sections of the paper.
pub sections: Vec<Section>,
/// Bibliography entries. Omitted from the serialized output when empty and
/// defaulted to an empty list when missing on deserialization.
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub references: Vec<Reference>,
}
impl TextBlock {
    /// Builds a `TextBlock` from a layout block: the block's joined text plus
    /// the rectangle described by its position and size.
    pub fn from_block(block: &Block) -> TextBlock {
        let text = block.get_text();
        let coordinates = Coordinate::from_object(block.x, block.y, block.width, block.height);
        TextBlock { text, coordinates }
    }
}
/// One section of a paper: its title, body paragraphs and captions.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct Section {
/// Ordering key; `Section::from_pages` returns sections sorted by it.
pub index: i16,
/// Section title (the `section` label of the blocks it was built from).
pub title: String,
/// Body text, one entry per non-caption block.
pub contents: Vec<String>,
/// Math-marked counterpart of `contents`; omitted from the serialized
/// output when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub math_contents: Option<Vec<String>>,
/// Caption text, one entry per caption block. Omitted from the serialized
/// output when empty and defaulted to an empty list on deserialization.
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub captions: Vec<String>,
}
impl Section {
pub fn from_pages(pages: &Vec<Page>) -> Vec<Section> {
let mut section_indices: HashMap<String, i16> = HashMap::new();
let mut section_map: HashMap<String, Vec<String>> = HashMap::new();
let mut caption_map: HashMap<String, Vec<String>> = HashMap::new();
let mut last_text = String::new();
let eos_ptn = regex::Regex::new(r"(\.)(\W)").unwrap();
let ex_ws_ptn = regex::Regex::new(r"\s+").unwrap();
for page in pages {
for block in &page.blocks {
let keys = section_map.keys().cloned().collect::<Vec<String>>();
let mut text_block = block.get_text().trim().to_string();
if text_block.ends_with("-") {
last_text.push_str(&text_block.trim_end_matches("-"));
continue;
}
if !last_text.is_empty() {
last_text.push_str(&text_block);
text_block = last_text.clone();
last_text.clear();
}
text_block = eos_ptn.replace_all(&text_block, "$1 $2").to_string();
text_block = ex_ws_ptn.replace_all(&text_block, " ").to_string();
let is_caption = block.block_type == BlockType::Caption;
if is_caption {
caption_map
.entry(block.section.clone())
.or_insert_with(Vec::new)
.push(text_block);
if !section_indices.contains_key(&block.section) {
section_indices.insert(block.section.clone(), section_indices.len() as i16);
}
} else {
if keys.contains(&block.section) {
let content = section_map.get_mut(&block.section).unwrap();
content.push(text_block);
} else {
section_map.insert(block.section.clone(), vec![text_block]);
section_indices.insert(block.section.clone(), section_indices.len() as i16);
}
}
}
}
let mut sections = Vec::new();
for (title, contents) in section_map {
let captions = caption_map.remove(&title).unwrap_or_default();
sections.push(Section {
index: section_indices.get(&title).copied().unwrap_or(0),
title: title,
contents: contents,
math_contents: None, captions: captions,
});
}
for (title, captions) in caption_map {
sections.push(Section {
index: section_indices.get(&title).copied().unwrap_or(0),
title: title,
contents: Vec::new(),
math_contents: None,
captions: captions,
});
}
sections.sort_by(|a, b| a.index.cmp(&b.index));
return sections;
}
pub fn from_pages_with_math(
pages: &Vec<Page>,
math_texts: &HashMap<(crate::config::PageNumber, usize), String>,
) -> Vec<Section> {
let mut section_indices: HashMap<String, i16> = HashMap::new();
let mut section_map: HashMap<String, Vec<String>> = HashMap::new();
let mut math_section_map: HashMap<String, Vec<String>> = HashMap::new();
let mut caption_map: HashMap<String, Vec<String>> = HashMap::new();
let mut last_text = String::new();
let mut last_math_text = String::new();
let eos_ptn = regex::Regex::new(r"(\.)(\W)").unwrap();
let ex_ws_ptn = regex::Regex::new(r"\s+").unwrap();
for page in pages {
for (block_idx, block) in page.blocks.iter().enumerate() {
let keys = section_map.keys().cloned().collect::<Vec<String>>();
let mut text_block = block.get_text().trim().to_string();
let math_text = math_texts
.get(&(page.page_number, block_idx))
.cloned()
.unwrap_or_else(|| text_block.clone());
let mut math_block = math_text.trim().to_string();
if text_block.ends_with("-") {
last_text.push_str(&text_block.trim_end_matches("-"));
last_math_text.push_str(&math_block.trim_end_matches("-"));
continue;
}
if !last_text.is_empty() {
last_text.push_str(&text_block);
text_block = last_text.clone();
last_text.clear();
last_math_text.push_str(&math_block);
math_block = last_math_text.clone();
last_math_text.clear();
}
text_block = eos_ptn.replace_all(&text_block, "$1 $2").to_string();
text_block = ex_ws_ptn.replace_all(&text_block, " ").to_string();
math_block = eos_ptn.replace_all(&math_block, "$1 $2").to_string();
math_block = ex_ws_ptn.replace_all(&math_block, " ").to_string();
let is_caption = block.block_type == BlockType::Caption;
if is_caption {
caption_map
.entry(block.section.clone())
.or_insert_with(Vec::new)
.push(text_block);
if !section_indices.contains_key(&block.section) {
section_indices.insert(block.section.clone(), section_indices.len() as i16);
}
} else {
if keys.contains(&block.section) {
section_map.get_mut(&block.section).unwrap().push(text_block);
math_section_map.get_mut(&block.section).unwrap().push(math_block);
} else {
section_map.insert(block.section.clone(), vec![text_block]);
math_section_map.insert(block.section.clone(), vec![math_block]);
section_indices.insert(block.section.clone(), section_indices.len() as i16);
}
}
}
}
let mut sections = Vec::new();
for (title, contents) in section_map {
let captions = caption_map.remove(&title).unwrap_or_default();
let math_contents = math_section_map.remove(&title);
let has_math = math_contents.as_ref().map_or(false, |mc| {
mc.iter().zip(contents.iter()).any(|(m, c)| m != c)
});
sections.push(Section {
index: section_indices.get(&title).copied().unwrap_or(0),
title: title,
contents: contents,
math_contents: if has_math { math_contents } else { None },
captions: captions,
});
}
for (title, captions) in caption_map {
sections.push(Section {
index: section_indices.get(&title).copied().unwrap_or(0),
title: title,
contents: Vec::new(),
math_contents: None,
captions: captions,
});
}
sections.sort_by(|a, b| a.index.cmp(&b.index));
return sections;
}
pub fn get_text(&self) -> String {
if self.contents.len() == 0 {
return String::new();
} else {
return self.contents.join("\n");
}
}
pub fn get_math_text(&self) -> String {
if let Some(ref math) = self.math_contents {
if !math.is_empty() {
return math.join("\n");
}
}
self.get_text()
}
}
// Unit tests for `fix_suffix_hyphens`.
#[cfg(test)]
mod tests {
use super::*;
// A word glued directly onto a known suffix gets the hyphen re-inserted.
#[test]
fn test_fix_suffix_hyphens_direct_connection() {
assert_eq!(fix_suffix_hyphens("databased"), "data-based");
assert_eq!(fix_suffix_hyphens("modelbased"), "model-based");
assert_eq!(fix_suffix_hyphens("eventdriven"), "event-driven");
assert_eq!(fix_suffix_hyphens("datadriven"), "data-driven");
assert_eq!(fix_suffix_hyphens("objectoriented"), "object-oriented");
assert_eq!(fix_suffix_hyphens("contextaware"), "context-aware");
assert_eq!(fix_suffix_hyphens("userfriendly"), "user-friendly");
assert_eq!(fix_suffix_hyphens("domainspecific"), "domain-specific");
}
// Input that already has the hyphen is returned unchanged.
#[test]
fn test_fix_suffix_hyphens_already_hyphenated() {
assert_eq!(fix_suffix_hyphens("data-based"), "data-based");
assert_eq!(fix_suffix_hyphens("event-driven"), "event-driven");
assert_eq!(fix_suffix_hyphens("object-oriented"), "object-oriented");
assert_eq!(fix_suffix_hyphens("context-aware"), "context-aware");
assert_eq!(fix_suffix_hyphens("user-friendly"), "user-friendly");
assert_eq!(fix_suffix_hyphens("domain-specific"), "domain-specific");
}
// A space between the word and the suffix is never turned into a hyphen.
#[test]
fn test_fix_suffix_hyphens_space_separated() {
assert_eq!(fix_suffix_hyphens("data based"), "data based");
assert_eq!(fix_suffix_hyphens("event driven"), "event driven");
assert_eq!(fix_suffix_hyphens("object oriented"), "object oriented");
assert_eq!(fix_suffix_hyphens("context aware"), "context aware");
}
// Several compounds in one sentence are all repaired.
#[test]
fn test_fix_suffix_hyphens_multiple_occurrences() {
assert_eq!(
fix_suffix_hyphens("This is a databased and eventdriven system."),
"This is a data-based and event-driven system."
);
assert_eq!(
fix_suffix_hyphens("userfriendly and domainspecific approach"),
"user-friendly and domain-specific approach"
);
}
// Hyphenated and glued compounds can appear side by side.
#[test]
fn test_fix_suffix_hyphens_mixed_cases() {
assert_eq!(
fix_suffix_hyphens("data-based and eventdriven"),
"data-based and event-driven"
);
assert_eq!(
fix_suffix_hyphens("The modelbased approach is user-friendly."),
"The model-based approach is user-friendly."
);
}
// Text without any known suffix, including the empty string, is unchanged.
#[test]
fn test_fix_suffix_hyphens_no_suffix() {
assert_eq!(fix_suffix_hyphens("hello world"), "hello world");
assert_eq!(fix_suffix_hyphens("simple text"), "simple text");
assert_eq!(fix_suffix_hyphens(""), "");
}
// A suffix standing alone as a word is unchanged.
#[test]
fn test_fix_suffix_hyphens_suffix_alone() {
assert_eq!(fix_suffix_hyphens("based"), "based");
assert_eq!(fix_suffix_hyphens("driven"), "driven");
assert_eq!(fix_suffix_hyphens("oriented"), "oriented");
}
// One glued example for every entry of `HYPHENATED_SUFFIXES`.
#[test]
fn test_fix_suffix_hyphens_all_suffixes() {
let test_cases = vec![
("databased", "data-based"),
("datadriven", "data-driven"),
("objectoriented", "object-oriented"),
("contextaware", "context-aware"),
("platformagnostic", "platform-agnostic"),
("platformindependent", "platform-independent"),
("pathdependent", "path-dependent"),
("mobilefirst", "mobile-first"),
("cloudnative", "cloud-native"),
("datacentric", "data-centric"),
("resourceintensive", "resource-intensive"),
("cpubound", "cpu-bound"),
("threadsafe", "thread-safe"),
("errorfree", "error-free"),
("futureproof", "future-proof"),
("energyefficient", "energy-efficient"),
("codeoptimized", "code-optimized"),
("aienabled", "ai-enabled"),
("aipowered", "ai-powered"),
("productionready", "production-ready"),
("gpucapable", "gpu-capable"),
("backwardcompatible", "backward-compatible"),
("fullycompliant", "fully-compliant"),
("lowlevel", "low-level"),
("largescale", "large-scale"),
("systemwide", "system-wide"),
("taskspecific", "task-specific"),
("userfriendly", "user-friendly"),
("customerfacing", "customer-facing"),
("shelllike", "shell-like"),
("pythonstyle", "python-style"),
];
for (input, expected) in test_cases {
assert_eq!(
fix_suffix_hyphens(input),
expected,
"Failed for input: {}",
input
);
}
}
}