// Lint exceptions for this file:
// - `unnecessary_wraps`: the `process_*` handlers and `merge_adjacent_text_items`
//   return `Result` although none of them currently produces an error
//   (presumably to keep one uniform handler signature — confirm before removing).
// - `unused_self`: helpers such as `detect_list_marker` and `detect_code_language`
//   take `&self` without reading any field.
// - `manual_pattern_char_comparison`: `detect_list_marker` searches with a closure
//   that compares against '.' and ')'.
#![allow(
clippy::unnecessary_wraps,
clippy::unused_self,
clippy::manual_pattern_char_comparison
)]
use crate::document::text_utils::{
TextSanitizer, calculate_section_level, extract_section_number, is_likely_heading,
};
use crate::document::types::*;
use crate::document::types_extended::*;
use crate::error::Result;
/// Feature toggles for [`PageAssembler`].
///
/// The struct is plain data (four booleans), so it derives `Clone`, `PartialEq`
/// and `Eq` in addition to `Debug`; callers can reuse and compare configurations.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct PageAssemblerOptions {
    /// Pass extracted text through the sanitizer before storing it in an item.
    pub enable_text_sanitization: bool,
    /// Promote text clusters that look like headings to section headers.
    pub enable_heading_detection: bool,
    /// List-detection toggle.
    /// NOTE(review): no code in this file reads this flag — confirm where it is used.
    pub enable_list_detection: bool,
    /// Merge runs of consecutive plain paragraphs into a single paragraph.
    pub merge_adjacent_text: bool,
}
impl Default for PageAssemblerOptions {
fn default() -> Self {
Self {
enable_text_sanitization: true,
enable_heading_detection: true,
enable_list_detection: true,
merge_adjacent_text: true,
}
}
}
/// Converts layout clusters into document items.
///
/// Bundles the feature toggles with the sanitizer that is applied to
/// extracted text.
#[derive(Debug)]
pub struct PageAssembler {
/// Feature toggles consulted while assembling a page.
options: PageAssemblerOptions,
/// Applied to extracted text when `enable_text_sanitization` is set.
sanitizer: TextSanitizer,
}
impl PageAssembler {
/// Creates an assembler that uses `options` and a freshly constructed sanitizer.
pub fn new(options: PageAssemblerOptions) -> Self {
    let sanitizer = TextSanitizer::new();
    Self { options, sanitizer }
}
/// Converts `clusters` into document items, in input order.
///
/// Each cluster is processed according to its label. When
/// `merge_adjacent_text` is enabled, the resulting items are then passed
/// through the adjacent-paragraph merge step.
///
/// # Errors
///
/// Propagates any error returned while processing a cluster or merging items.
pub fn assemble(&self, clusters: &[Cluster]) -> Result<Vec<DocItem>> {
    let mut assembled: Vec<DocItem> = Vec::new();
    for cluster in clusters {
        assembled.extend(self.process_cluster(cluster)?);
    }

    if !self.options.merge_adjacent_text {
        return Ok(assembled);
    }
    self.merge_adjacent_text_items(assembled)
}
fn process_cluster(&self, cluster: &Cluster) -> Result<Vec<DocItem>> {
match cluster.label {
DocItemLabel::Title => self.process_title(cluster),
DocItemLabel::SectionHeader => self.process_section_header(cluster),
DocItemLabel::Paragraph | DocItemLabel::Text => self.process_text(cluster),
DocItemLabel::ListItem => self.process_list_item(cluster),
DocItemLabel::Caption => self.process_caption(cluster),
DocItemLabel::Footnote => self.process_footnote(cluster),
DocItemLabel::PageHeader | DocItemLabel::PageFooter => {
self.process_header_footer(cluster)
}
DocItemLabel::Table => self.process_table(cluster),
DocItemLabel::Picture | DocItemLabel::Figure => self.process_picture(cluster),
DocItemLabel::Code => self.process_code(cluster),
DocItemLabel::Formula => self.process_formula(cluster),
DocItemLabel::CheckboxSelected | DocItemLabel::CheckboxUnselected => {
self.process_checkbox(cluster)
}
}
}
/// Flattens the text cells of `cluster` into one string.
///
/// Cells are ordered by their top coordinate, then by their left coordinate.
/// Between consecutive cells:
/// * a vertical distance greater than 5.0 starts a new line;
/// * otherwise a horizontal gap wider than 30% of the current cell's width
///   inserts a single space (unless the text already ends in whitespace).
///
/// The result is passed through the sanitizer when `enable_text_sanitization`
/// is set.
fn extract_text(&self, cluster: &Cluster) -> String {
    // Sort references instead of cloning every cell (and its text).
    let mut cells: Vec<_> = cluster.cells.iter().collect();
    // Stable sort. `partial_cmp` keeps the usual numeric order; it only yields
    // `None` when a NaN is involved, where `total_cmp` supplies a consistent
    // order instead of panicking.
    cells.sort_by(|a, b| {
        let by_top = a
            .bbox
            .t
            .partial_cmp(&b.bbox.t)
            .unwrap_or_else(|| a.bbox.t.total_cmp(&b.bbox.t));
        by_top.then_with(|| {
            a.bbox
                .l
                .partial_cmp(&b.bbox.l)
                .unwrap_or_else(|| a.bbox.l.total_cmp(&b.bbox.l))
        })
    });

    let mut text = String::new();
    let mut remaining = cells.into_iter();

    // The first cell has no predecessor, so it is emitted without a separator.
    // Tracking the predecessor explicitly (rather than using 0.0 as a "none"
    // sentinel) keeps cells whose top or right edge is at or below zero from
    // suppressing separators for the cell that follows them.
    if let Some(first) = remaining.next() {
        text.push_str(&first.text);
        let mut previous = first;

        for cell in remaining {
            let gap_x = cell.bbox.l - previous.bbox.r;
            let gap_y = (cell.bbox.t - previous.bbox.t).abs();
            let cell_width = cell.bbox.r - cell.bbox.l;

            if gap_y > 5.0 {
                // New visual line; never emit two consecutive newlines.
                if !text.ends_with('\n') {
                    text.push('\n');
                }
            } else if gap_x > (cell_width * 0.3)
                && !text.ends_with(' ')
                && !text.ends_with('\n')
            {
                text.push(' ');
            }

            text.push_str(&cell.text);
            previous = cell;
        }
    }

    if self.options.enable_text_sanitization {
        self.sanitizer.sanitize(&text)
    } else {
        text
    }
}
/// Wraps the cluster's extracted text in a `Title` item without formatting.
fn process_title(&self, cluster: &Cluster) -> Result<Vec<DocItem>> {
    let title = TextItem {
        text: self.extract_text(cluster),
        formatting: None,
        label: DocItemLabel::Title,
    };
    Ok(vec![DocItem::Title(title)])
}
/// Builds a `SectionHeader` item from a cluster labelled as a section header.
///
/// The level is derived from the section number when `extract_section_number`
/// finds one in the text; otherwise it falls back to level 2.
fn process_section_header(&self, cluster: &Cluster) -> Result<Vec<DocItem>> {
    let text = self.extract_text(cluster);
    // The argument was a mis-encoded token in the source; it is the borrowed
    // section number.
    let level = match extract_section_number(&text) {
        Some(section_num) => calculate_section_level(&section_num),
        None => 2,
    };
    Ok(vec![DocItem::SectionHeader(SectionHeaderItem {
        text,
        level,
        formatting: None,
    })])
}
/// Builds an item from a paragraph/text cluster.
///
/// When heading detection is enabled and `is_likely_heading` accepts the
/// text, the cluster becomes a `SectionHeader` (level taken from its section
/// number, or 2 when none is found). Otherwise it becomes a plain
/// `Paragraph`.
fn process_text(&self, cluster: &Cluster) -> Result<Vec<DocItem>> {
    let text = self.extract_text(cluster);

    if self.options.enable_heading_detection && is_likely_heading(&text) {
        // The argument was a mis-encoded token in the source; it is the
        // borrowed section number.
        let level = match extract_section_number(&text) {
            Some(section_num) => calculate_section_level(&section_num),
            None => 2,
        };
        return Ok(vec![DocItem::SectionHeader(SectionHeaderItem {
            text,
            level,
            formatting: None,
        })]);
    }

    Ok(vec![DocItem::Paragraph(TextItem {
        text,
        formatting: None,
        label: DocItemLabel::Paragraph,
    })])
}
/// Builds a `ListItem` from a list-item cluster.
///
/// A detected marker is removed from the front of the text exactly once and
/// stored separately; when no marker is detected the text is kept unchanged
/// and the marker defaults to `"-"`. The nesting level is always 0.
fn process_list_item(&self, cluster: &Cluster) -> Result<Vec<DocItem>> {
    let text = self.extract_text(cluster);
    let (marker, enumerated) = self.detect_list_marker(&text);

    let body = match marker.as_deref() {
        Some(prefix) => {
            // Detection ignores leading whitespace, so strip it here as well.
            // `strip_prefix` removes the marker once, whereas
            // `trim_start_matches` would also eat repeated occurrences
            // (e.g. "1.1. foo" with marker "1.").
            let trimmed = text.trim_start();
            trimmed
                .strip_prefix(prefix)
                .unwrap_or(trimmed)
                .trim_start()
                .to_string()
        }
        None => text,
    };

    Ok(vec![DocItem::ListItem(ListItemData {
        text: body,
        marker: marker.unwrap_or_else(|| "-".to_string()),
        enumerated,
        level: 0,
    })])
}
/// Detects a list marker at the start of `text` (leading whitespace ignored).
///
/// Returns `(marker, enumerated)`:
/// * a bullet (`-`, `•` or `·`) followed by a space yields that bullet and
///   `false`;
/// * a run of numeric characters followed by `.` or `)` yields that prefix
///   including the delimiter and `true`, unless the `.` is immediately
///   followed by another numeric character (a decimal such as "3.14" or a
///   dotted section number such as "1.1" is not a marker);
/// * anything else yields `(None, false)`.
fn detect_list_marker(&self, text: &str) -> (Option<String>, bool) {
    const BULLETS: [char; 3] = ['-', '•', '·'];

    let trimmed = text.trim_start();

    let mut leading = trimmed.chars();
    if let (Some(first), Some(' ')) = (leading.next(), leading.next()) {
        if BULLETS.contains(&first) {
            return (Some(first.to_string()), false);
        }
    }

    if let Some(pos) = trimmed.find(|c: char| c == '.' || c == ')') {
        let number = &trimmed[..pos];
        // Both delimiters are one byte wide, so `pos + 1` is a char boundary.
        let rest = &trimmed[pos + 1..];
        let is_decimal = trimmed[pos..].starts_with('.')
            && rest.chars().next().map_or(false, char::is_numeric);

        if !number.is_empty() && number.chars().all(char::is_numeric) && !is_decimal {
            return (Some(trimmed[..=pos].to_string()), true);
        }
    }

    (None, false)
}
/// Emits the caption text as an italic `Paragraph` item labelled `Caption`.
fn process_caption(&self, cluster: &Cluster) -> Result<Vec<DocItem>> {
    let text = self.extract_text(cluster);
    let italic = Formatting {
        italic: true,
        ..Default::default()
    };
    let caption = TextItem {
        text,
        formatting: Some(italic),
        label: DocItemLabel::Caption,
    };
    Ok(vec![DocItem::Paragraph(caption)])
}
/// Emits the footnote text as an unformatted `Paragraph` item labelled `Footnote`.
fn process_footnote(&self, cluster: &Cluster) -> Result<Vec<DocItem>> {
    let footnote = TextItem {
        text: self.extract_text(cluster),
        formatting: None,
        label: DocItemLabel::Footnote,
    };
    Ok(vec![DocItem::Paragraph(footnote)])
}
/// Emits page header/footer text as an unformatted `Paragraph` item that
/// keeps the cluster's own label.
fn process_header_footer(&self, cluster: &Cluster) -> Result<Vec<DocItem>> {
    let running_text = TextItem {
        text: self.extract_text(cluster),
        formatting: None,
        label: cluster.label,
    };
    Ok(vec![DocItem::Paragraph(running_text)])
}
/// Emits a `Table` item holding the whole cluster text in a single 1x1 cell.
///
/// No table structure is reconstructed here and no caption is attached.
fn process_table(&self, cluster: &Cluster) -> Result<Vec<DocItem>> {
    let only_cell = TableCell {
        text: self.extract_text(cluster),
        row_span: 1,
        col_span: 1,
    };
    let data = TableData {
        num_rows: 1,
        num_cols: 1,
        grid: vec![vec![only_cell]],
    };
    Ok(vec![DocItem::Table(TableItem {
        data,
        caption: None,
    })])
}
/// Emits a `Picture` item with a placeholder comment that records the
/// cluster's left/top position.
///
/// Text found inside the cluster, if any, becomes the caption.
fn process_picture(&self, cluster: &Cluster) -> Result<Vec<DocItem>> {
    let caption = if cluster.cells.is_empty() {
        None
    } else {
        Some(self.extract_text(cluster))
    };
    let placeholder = format!(
        "<!-- Figure at ({}, {}) -->",
        cluster.bbox.l, cluster.bbox.t
    );
    Ok(vec![DocItem::Picture(PictureItem {
        caption,
        placeholder,
    })])
}
/// Emits a `Code` item with the extracted text and a guessed language, if any.
fn process_code(&self, cluster: &Cluster) -> Result<Vec<DocItem>> {
    let source = self.extract_text(cluster);
    let guessed = self.detect_code_language(&source);
    let item = CodeItem {
        text: source,
        language: guessed,
    };
    Ok(vec![DocItem::Code(item)])
}
/// Guesses the language of a code snippet from substring markers.
///
/// Languages are tried in a fixed order (python, javascript, rust); the first
/// one with any marker present in `text` wins. Returns `None` when no marker
/// matches.
fn detect_code_language(&self, text: &str) -> Option<String> {
    // Order matters: earlier entries take precedence when several match.
    let candidates: [(&str, [&str; 3]); 3] = [
        ("python", ["def ", "import ", "print("]),
        ("javascript", ["function ", "const ", "let "]),
        ("rust", ["fn ", "impl ", "pub "]),
    ];

    candidates
        .iter()
        .find(|(_, markers)| markers.iter().any(|marker| text.contains(*marker)))
        .map(|(language, _)| (*language).to_string())
}
/// Emits a `Formula` item; formulas shorter than 50 characters are marked inline.
///
/// Length is measured in characters rather than UTF-8 bytes, so multi-byte
/// math symbols do not push a short formula over the threshold.
fn process_formula(&self, cluster: &Cluster) -> Result<Vec<DocItem>> {
    const INLINE_MAX_CHARS: usize = 50;

    let text = self.extract_text(cluster);
    let is_inline = text.chars().count() < INLINE_MAX_CHARS;
    Ok(vec![DocItem::Formula(FormulaItem { text, is_inline })])
}
/// Emits a checkbox cluster as a non-enumerated, level-0 `ListItem` whose
/// marker is `[x]` when the cluster is labelled selected and `[ ]` otherwise.
fn process_checkbox(&self, cluster: &Cluster) -> Result<Vec<DocItem>> {
    let text = self.extract_text(cluster);
    let marker = if cluster.label == DocItemLabel::CheckboxSelected {
        "[x]"
    } else {
        "[ ]"
    };
    let item = ListItemData {
        text,
        marker: String::from(marker),
        enumerated: false,
        level: 0,
    };
    Ok(vec![DocItem::ListItem(item)])
}
/// Collapses each run of consecutive plain paragraphs into one paragraph.
///
/// Only `Paragraph` items whose label is `Paragraph` take part; their texts
/// are joined with a single space and the merged item carries no formatting.
/// Every other item ends the current run and is passed through unchanged.
/// Inputs with fewer than two items are returned as-is.
fn merge_adjacent_text_items(&self, items: Vec<DocItem>) -> Result<Vec<DocItem>> {
    if items.len() < 2 {
        return Ok(items);
    }

    // Builds the output item for a finished paragraph run.
    let plain_paragraph = |text: String| {
        DocItem::Paragraph(TextItem {
            text,
            formatting: None,
            label: DocItemLabel::Paragraph,
        })
    };

    let mut output = Vec::new();
    // Text of the paragraph run being accumulated, if one is open.
    let mut pending: Option<String> = None;

    for item in items {
        match item {
            DocItem::Paragraph(paragraph) if paragraph.label == DocItemLabel::Paragraph => {
                match pending.as_mut() {
                    Some(buffer) => {
                        buffer.push(' ');
                        buffer.push_str(&paragraph.text);
                    }
                    None => pending = Some(paragraph.text),
                }
            }
            other => {
                // A non-paragraph item closes the open run before being emitted.
                if let Some(text) = pending.take() {
                    output.push(plain_paragraph(text));
                }
                output.push(other);
            }
        }
    }

    // Flush a run that reaches the end of the input.
    if let Some(text) = pending {
        output.push(plain_paragraph(text));
    }
    Ok(output)
}
}
impl Default for PageAssembler {
fn default() -> Self {
Self::new(PageAssemblerOptions::default())
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Bullets are reported as non-enumerated; "1." is an enumerated marker.
    #[test]
    fn test_detect_list_marker() {
        let assembler = PageAssembler::default();

        let dash = assembler.detect_list_marker("- Item");
        assert_eq!(dash.0, Some("-".to_string()));
        assert!(!dash.1);

        let numbered = assembler.detect_list_marker("1. First");
        assert_eq!(numbered.0, Some("1.".to_string()));
        assert!(numbered.1);

        let bullet = assembler.detect_list_marker("• Bullet");
        assert!(!bullet.1);
    }

    /// One representative snippet per supported language.
    #[test]
    fn test_detect_code_language() {
        let assembler = PageAssembler::default();
        let cases = [
            ("def main():\n print('hello')", "python"),
            ("function test() { const x = 1; }", "javascript"),
            ("fn main() { println!(\"hello\"); }", "rust"),
        ];

        for &(snippet, expected) in cases.iter() {
            assert_eq!(
                assembler.detect_code_language(snippet),
                Some(expected.to_string())
            );
        }
    }
}