use serde::{Deserialize, Serialize};
use super::bbox::BoundingBox;
use super::chunks::ImageChunk;
use super::chunks::LineArtChunk;
use super::content::ContentElement;
use super::enums::{SemanticType, TextFormat};
use super::table::TableBorder;
use super::text::TextColumn;
/// Text-bearing semantic node: a bounding box, a semantic type, the text
/// columns that hold the content, and a set of optional font/color attributes.
///
/// Derives `Serialize`/`Deserialize`, so field names and order are part of the
/// serialized representation.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SemanticTextNode {
/// Bounding box of the node (coordinate conventions are defined by `BoundingBox`).
pub bbox: BoundingBox,
/// Optional index of the node; `None` when unset.
/// NOTE(review): what the index enumerates is not visible in this file.
pub index: Option<u32>,
/// Optional level label, stored as a string; `None` when unset.
pub level: Option<String>,
/// Semantic classification of the node.
pub semantic_type: SemanticType,
/// Optional score attached to the semantic classification.
/// NOTE(review): presumably a confidence for `semantic_type` — confirm range and meaning.
pub correct_semantic_score: Option<f64>,
/// Text columns making up the node; `value()` and `lines_number()` read from these.
pub columns: Vec<TextColumn>,
/// Optional font weight.
pub font_weight: Option<f64>,
/// Optional font size (units not specified here).
pub font_size: Option<f64>,
/// Optional text color as a list of numeric components.
/// NOTE(review): color space and component count are not established here.
pub text_color: Option<Vec<f64>>,
/// Optional italic angle.
pub italic_angle: Option<f64>,
/// Optional font name.
pub font_name: Option<String>,
/// Optional text format.
pub text_format: Option<TextFormat>,
/// Optional maximum font size.
/// NOTE(review): presumably the largest size among the node's text — confirm.
pub max_font_size: Option<f64>,
/// Optional background color as a list of numeric components (color space not specified here).
pub background_color: Option<Vec<f64>>,
/// Flag marking the node's text as hidden.
pub is_hidden_text: bool,
}
impl SemanticTextNode {
    /// Returns the node's text: the `value()` of every column, in column
    /// order, separated by a single `"\n"`.
    ///
    /// A node without columns yields an empty string.
    pub fn value(&self) -> String {
        let mut parts = Vec::with_capacity(self.columns.len());
        for column in &self.columns {
            parts.push(column.value());
        }
        parts.join("\n")
    }

    /// Returns the total number of text lines, summed over every text block
    /// of every column.
    pub fn lines_number(&self) -> usize {
        let mut total: usize = 0;
        for column in &self.columns {
            for block in &column.text_blocks {
                total += block.text_lines.len();
            }
        }
        total
    }

    /// Returns how many columns the node holds.
    pub fn columns_number(&self) -> usize {
        let columns = &self.columns;
        columns.len()
    }

    /// Returns `true` when the node's text is empty after trimming leading
    /// and trailing whitespace.
    pub fn is_empty(&self) -> bool {
        let text = self.value();
        let trimmed = text.trim();
        trimmed.is_empty()
    }

    /// Returns `true` when every character of the node's text is whitespace.
    ///
    /// An empty text also returns `true`, since there is no non-whitespace
    /// character to find.
    pub fn is_space_node(&self) -> bool {
        let text = self.value();
        !text.chars().any(|ch| !ch.is_whitespace())
    }

    /// Returns `true` when the first character after any leading whitespace
    /// is an ASCII digit (`'0'..='9'`).
    pub fn starts_with_arabic_number(&self) -> bool {
        let first = self.value().trim_start().chars().next();
        matches!(first, Some('0'..='9'))
    }
}
/// Paragraph node: a `SemanticTextNode` plus paragraph-level layout flags.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SemanticParagraph {
/// Underlying text node carrying the content and shared attributes.
pub base: SemanticTextNode,
/// Flag for the paragraph's top edge.
/// NOTE(review): presumably "enclosed by a border/neighbor above" — confirm against the producer.
pub enclosed_top: bool,
/// Flag for the paragraph's bottom edge (same caveat as `enclosed_top`).
pub enclosed_bottom: bool,
/// Indentation value as a signed integer (units/meaning not established in this file).
pub indentation: i32,
}
/// Heading node: a `SemanticParagraph` plus an optional heading level.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SemanticHeading {
/// Underlying paragraph data.
pub base: SemanticParagraph,
/// Optional heading level; `None` when unset.
/// NOTE(review): numbering base (0 or 1) is not established in this file.
pub heading_level: Option<u32>,
}
/// Distinct wrapper type around `SemanticHeading`; adds no fields of its own.
/// NOTE(review): presumably marks headings that carry a number — confirm.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SemanticNumberHeading {
/// Underlying heading data.
pub base: SemanticHeading,
}
/// Caption node: a `SemanticTextNode` plus an optional link to other content.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SemanticCaption {
/// Underlying text node carrying the caption text.
pub base: SemanticTextNode,
/// Optional identifier of the content this caption is linked to; `None` when unlinked.
/// NOTE(review): the id namespace is defined outside this file.
pub linked_content_id: Option<u64>,
}
/// Header-or-footer node holding a list of nested content elements.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SemanticHeaderOrFooter {
/// Bounding box of the node.
pub bbox: BoundingBox,
/// Optional index of the node; `None` when unset.
pub index: Option<u32>,
/// Optional level label, stored as a string.
pub level: Option<String>,
/// Semantic classification of the node.
/// NOTE(review): presumably distinguishes header from footer — confirm.
pub semantic_type: SemanticType,
/// Content elements contained in this header/footer.
pub contents: Vec<ContentElement>,
}
/// Figure node grouping image chunks and line-art chunks.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SemanticFigure {
/// Bounding box of the node.
pub bbox: BoundingBox,
/// Optional index of the node; `None` when unset.
pub index: Option<u32>,
/// Optional level label, stored as a string.
pub level: Option<String>,
/// Semantic classification of the node.
pub semantic_type: SemanticType,
/// Image chunks belonging to the figure.
pub images: Vec<ImageChunk>,
/// Line-art chunks belonging to the figure.
pub line_arts: Vec<LineArtChunk>,
}
/// Table node wrapping a `TableBorder`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SemanticTable {
/// Bounding box of the node.
pub bbox: BoundingBox,
/// Optional index of the node; `None` when unset.
pub index: Option<u32>,
/// Optional level label, stored as a string.
pub level: Option<String>,
/// Semantic classification of the node.
pub semantic_type: SemanticType,
/// Table border data for this table (structure defined by `TableBorder`).
pub table_border: TableBorder,
}
/// Formula node carrying a LaTeX string.
///
/// Unlike most sibling node types in this file, it has no `semantic_type` field.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SemanticFormula {
/// Bounding box of the node.
pub bbox: BoundingBox,
/// Optional index of the node; `None` when unset.
pub index: Option<u32>,
/// Optional level label, stored as a string.
pub level: Option<String>,
/// LaTeX source of the formula.
pub latex: String,
}
/// Picture node referencing an image by index, with a textual description.
///
/// Unlike most sibling node types in this file, it has no `semantic_type` field.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SemanticPicture {
/// Bounding box of the node.
pub bbox: BoundingBox,
/// Optional index of the node; `None` when unset.
pub index: Option<u32>,
/// Optional level label, stored as a string.
pub level: Option<String>,
/// Index of the associated image.
/// NOTE(review): which collection this indexes into is not visible here — confirm.
pub image_index: u32,
/// Textual description of the picture.
pub description: String,
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a paragraph-typed text node with no columns and every optional
    /// attribute left unset.
    fn make_empty_text_node() -> SemanticTextNode {
        let bbox = BoundingBox::new(Some(1), 0.0, 0.0, 100.0, 12.0);
        SemanticTextNode {
            // Required data.
            bbox,
            semantic_type: SemanticType::Paragraph,
            columns: Vec::new(),
            is_hidden_text: false,
            // Optional attributes, all absent.
            index: None,
            level: None,
            correct_semantic_score: None,
            font_weight: None,
            font_size: None,
            text_color: None,
            italic_angle: None,
            font_name: None,
            text_format: None,
            max_font_size: None,
            background_color: None,
        }
    }

    /// A node without columns has no lines, no columns, and counts as both
    /// empty and whitespace-only.
    #[test]
    fn test_empty_text_node() {
        let node = make_empty_text_node();

        // Counting helpers.
        assert_eq!(node.columns_number(), 0);
        assert_eq!(node.lines_number(), 0);

        // Text predicates.
        assert!(node.is_space_node());
        assert!(node.is_empty());
    }
}