use std::borrow::Cow;
use std::fmt;
use ahash::AHashMap;
use super::document_structure::{ContentLayer, TextAnnotation};
use super::extraction::BoundingBox;
use super::metadata::Metadata;
use super::ocr_elements::{OcrBoundingGeometry, OcrConfidence, OcrElementLevel, OcrRotation};
use super::tables::Table;
use crate::types::ExtractedImage;
/// Fixed-width identifier stored in the `id` field of every [`InternalElement`].
///
/// Holds exactly 15 bytes. Identifiers produced by `InternalElementId::generate`
/// consist of the ASCII prefix `ie-` followed by 12 hex characters.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct InternalElementId([u8; 15]);
impl InternalElementId {
    /// Derives a deterministic identifier from an element's kind, text, page and index.
    ///
    /// The hasher is fed, in order: the kind bytes, the text bytes, the page as a
    /// little-endian `u32` (`u32::MAX` stands in for `None`), and the index as a
    /// little-endian `u32`. The first 6 digest bytes are hex-encoded into 12
    /// characters and appended to the `ie-` prefix, giving 15 bytes in total.
    pub fn generate(kind_discriminant: &str, text: &str, page: Option<u32>, index: u32) -> Self {
        let page_bytes = page.unwrap_or(u32::MAX).to_le_bytes();
        let index_bytes = index.to_le_bytes();

        // Feeding the parts one after another hashes their concatenation.
        let parts: [&[u8]; 4] = [
            kind_discriminant.as_bytes(),
            text.as_bytes(),
            &page_bytes,
            &index_bytes,
        ];
        let mut hasher = blake3::Hasher::new();
        for part in parts.iter() {
            hasher.update(part);
        }
        let digest = hasher.finalize();

        let mut out = [0u8; 15];
        let (prefix, hex_tail) = out.split_at_mut(3);
        prefix.copy_from_slice(b"ie-");
        // 6 input bytes encode to exactly the 12 bytes left after the prefix.
        hex::encode_to_slice(&digest.as_bytes()[..6], hex_tail).expect("fixed size");
        Self(out)
    }

    /// Wraps an existing identifier string without hashing.
    ///
    /// # Panics
    ///
    /// Panics when `id` is not exactly 15 bytes long.
    #[allow(dead_code)]
    pub(crate) fn new(id: &str) -> Self {
        let raw = id.as_bytes();
        assert!(
            raw.len() == 15,
            "InternalElementId must be exactly 15 bytes, got {}",
            raw.len()
        );
        let mut out = [0u8; 15];
        out.copy_from_slice(raw);
        Self(out)
    }

    /// Borrows the identifier as a string slice.
    pub fn as_str(&self) -> &str {
        // Both constructors in this impl fill the buffer with valid UTF-8:
        // `generate` writes an ASCII prefix plus hex text, `new` copies a whole `&str`.
        let bytes: &[u8] = &self.0;
        std::str::from_utf8(bytes).unwrap()
    }
}
/// Displays the identifier as the same text that `as_str` returns.
impl fmt::Display for InternalElementId {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
// The string form is handed to the formatter through `write_str`.
f.write_str(self.as_str())
}
}
/// Lets an identifier be passed wherever an `AsRef<str>` is accepted.
impl AsRef<str> for InternalElementId {
fn as_ref(&self) -> &str {
// Delegates to `as_str`, so both accessors yield the same slice.
self.as_str()
}
}
/// Container holding a document's elements, relationships and side collections.
///
/// `InternalDocument::new` starts every collection empty and every optional
/// field as `None`; the `push_*` methods append to the matching vector.
#[derive(Debug, Clone)]
pub struct InternalDocument {
/// Elements in insertion order; `push_element` returns an element's index in this vector.
pub elements: Vec<InternalElement>,
/// Relationships appended through `push_relationship`.
pub relationships: Vec<Relationship>,
/// Source format label supplied to `InternalDocument::new`.
pub source_format: Cow<'static, str>,
/// Document metadata; `Metadata::default()` on construction.
pub metadata: Metadata,
/// Images appended through `push_image`, which returns an image's index in this vector.
pub images: Vec<ExtractedImage>,
/// Tables appended through `push_table`, which returns a table's index in this vector.
pub tables: Vec<Table>,
/// URIs appended through `push_uri`, which stops adding once 100,000 are stored.
pub uris: Vec<super::uri::Uri>,
/// `None` on construction.
// NOTE(review): the element type suggests entries of an archive container — confirm.
pub children: Option<Vec<crate::types::ArchiveEntry>>,
/// MIME type; initialised to `application/octet-stream`.
pub mime_type: Cow<'static, str>,
/// Processing warnings; empty on construction.
pub processing_warnings: Vec<crate::types::ProcessingWarning>,
/// `None` on construction.
// NOTE(review): the element type suggests PDF annotations — confirm which extractors set this.
pub annotations: Option<Vec<crate::types::annotations::PdfAnnotation>>,
/// `None` on construction.
// NOTE(review): presumably page content supplied by an extractor instead of derived later — confirm.
pub prebuilt_pages: Option<Vec<crate::types::PageContent>>,
/// `None` on construction.
// NOTE(review): presumably already-rendered output text — confirm format with the producer.
pub pre_rendered_content: Option<String>,
/// `None` on construction.
// NOTE(review): presumably OCR elements supplied by an extractor — confirm.
pub prebuilt_ocr_elements: Option<Vec<crate::types::ocr_elements::OcrElement>>,
}
impl InternalDocument {
    /// Upper bound on stored URIs; `push_uri` ignores further pushes once it is reached.
    const MAX_URIS: usize = 100_000;

    /// Creates an empty document tagged with `source_format`.
    ///
    /// Every collection starts empty, every optional field is `None`, and the
    /// MIME type is `application/octet-stream`.
    pub fn new(source_format: impl Into<Cow<'static, str>>) -> Self {
        let source_format = source_format.into();
        Self {
            source_format,
            mime_type: Cow::Borrowed("application/octet-stream"),
            metadata: Metadata::default(),
            elements: Vec::new(),
            relationships: Vec::new(),
            images: Vec::new(),
            tables: Vec::new(),
            uris: Vec::new(),
            processing_warnings: Vec::new(),
            children: None,
            annotations: None,
            prebuilt_pages: None,
            pre_rendered_content: None,
            prebuilt_ocr_elements: None,
        }
    }

    /// Appends `element` and returns its position in `elements`.
    ///
    /// The position is narrowed with `as u32`, so it wraps if the vector ever
    /// holds more than `u32::MAX` entries.
    pub fn push_element(&mut self, element: InternalElement) -> u32 {
        self.elements.push(element);
        (self.elements.len() - 1) as u32
    }

    /// Appends `relationship` to `relationships`.
    pub fn push_relationship(&mut self, relationship: Relationship) {
        self.relationships.push(relationship);
    }

    /// Appends `table` and returns its position in `tables` (narrowed with `as u32`).
    pub fn push_table(&mut self, table: Table) -> u32 {
        self.tables.push(table);
        (self.tables.len() - 1) as u32
    }

    /// Appends `image` and returns its position in `images` (narrowed with `as u32`).
    pub fn push_image(&mut self, image: ExtractedImage) -> u32 {
        self.images.push(image);
        (self.images.len() - 1) as u32
    }

    /// Appends `uri` unless `MAX_URIS` entries are already stored, in which case
    /// the URI is dropped without any signal to the caller.
    pub fn push_uri(&mut self, uri: super::uri::Uri) {
        if self.uris.len() >= Self::MAX_URIS {
            return;
        }
        self.uris.push(uri);
    }

    /// Concatenates the text of every element, in order, separated by `"\n"`.
    ///
    /// Returns an empty string when the document has no elements.
    pub fn content(&self) -> String {
        let mut joined = String::new();
        for (position, element) in self.elements.iter().enumerate() {
            // Separator goes between entries only, never before the first.
            if position > 0 {
                joined.push('\n');
            }
            joined.push_str(&element.text);
        }
        joined
    }
}
/// One entry of an [`InternalDocument`]: a kind tag, its text and optional positional data.
///
/// Built with `InternalElement::text` and refined through the `with_*` builder methods.
#[derive(Debug, Clone, PartialEq)]
pub struct InternalElement {
/// Identifier; computed by `text` (page `None`, index 0) and recomputed by `with_index`.
pub id: InternalElementId,
/// Kind tag; its discriminant string is one of the inputs to the identifier hash.
pub kind: ElementKind,
/// Text content; also an input to the identifier hash.
pub text: String,
/// Depth value supplied to `text`.
// NOTE(review): presumably the nesting depth inside containers — confirm with producers.
pub depth: u16,
/// Page set through `with_page`; `None` otherwise.
// NOTE(review): whether pages are 0- or 1-based is not visible here — confirm.
pub page: Option<u32>,
/// Bounding box set through `with_bbox`; `None` otherwise.
pub bbox: Option<BoundingBox>,
/// Content layer; `ContentLayer::Body` unless changed through `with_layer`.
pub layer: ContentLayer,
/// Text annotations; empty unless replaced through `with_annotations`.
pub annotations: Vec<TextAnnotation>,
/// String key/value attributes set through `with_attributes`; `None` otherwise.
pub attributes: Option<AHashMap<String, String>>,
/// Anchor string set through `with_anchor`; `None` otherwise.
pub anchor: Option<String>,
/// `None` after `text`; no builder in this impl sets it, so callers assign the field directly.
pub ocr_geometry: Option<OcrBoundingGeometry>,
/// `None` after `text`; assigned directly by callers.
pub ocr_confidence: Option<OcrConfidence>,
/// `None` after `text`; assigned directly by callers.
pub ocr_rotation: Option<OcrRotation>,
}
impl InternalElement {
    /// Creates an element of `kind` carrying `text` at the given `depth`.
    ///
    /// The identifier is generated with page `None` and index 0; call
    /// `with_index` afterwards to fold the page and a real index into it.
    /// All optional fields start as `None` and the layer is `ContentLayer::Body`.
    pub fn text(kind: ElementKind, text: impl Into<String>, depth: u16) -> Self {
        let text: String = text.into();
        Self {
            id: InternalElementId::generate(kind.discriminant(), &text, None, 0),
            kind,
            text,
            depth,
            layer: ContentLayer::Body,
            annotations: Vec::new(),
            page: None,
            bbox: None,
            attributes: None,
            anchor: None,
            ocr_geometry: None,
            ocr_confidence: None,
            ocr_rotation: None,
        }
    }

    /// Sets the page. The identifier is left as it was; only `with_index` recomputes it.
    pub fn with_page(self, page: u32) -> Self {
        Self {
            page: Some(page),
            ..self
        }
    }

    /// Sets the bounding box.
    pub fn with_bbox(self, bbox: BoundingBox) -> Self {
        Self {
            bbox: Some(bbox),
            ..self
        }
    }

    /// Replaces the content layer.
    pub fn with_layer(self, layer: ContentLayer) -> Self {
        Self { layer, ..self }
    }

    /// Sets the anchor string.
    pub fn with_anchor(self, anchor: impl Into<String>) -> Self {
        Self {
            anchor: Some(anchor.into()),
            ..self
        }
    }

    /// Replaces the annotation list.
    pub fn with_annotations(self, annotations: Vec<TextAnnotation>) -> Self {
        Self {
            annotations,
            ..self
        }
    }

    /// Sets the attribute map.
    pub fn with_attributes(self, attributes: AHashMap<String, String>) -> Self {
        Self {
            attributes: Some(attributes),
            ..self
        }
    }

    /// Regenerates the identifier from the current kind, text and page plus `index`.
    ///
    /// Because the current page is read here, a page set after this call is not
    /// reflected in the identifier.
    pub fn with_index(self, index: u32) -> Self {
        let id = InternalElementId::generate(self.kind.discriminant(), &self.text, self.page, index);
        Self { id, ..self }
    }
}
/// Kind tag stored in the `kind` field of every [`InternalElement`].
///
/// `discriminant` maps each variant to a fixed snake_case string that ignores
/// the variant's payload. The `*Start` / `*End` variants are container markers
/// paired by `matching_end`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ElementKind {
Title,
/// Heading carrying a `level`; the valid range is not constrained here.
Heading { level: u8 },
Paragraph,
/// List entry; `ordered` is carried as payload.
ListItem { ordered: bool },
Code,
Formula,
FootnoteDefinition,
FootnoteRef,
Citation,
/// Slide carrying its `number`.
Slide { number: u32 },
DefinitionTerm,
DefinitionDescription,
Admonition,
RawBlock,
MetadataBlock,
/// Container start marker; `matching_end` pairs it with `ListEnd`.
ListStart { ordered: bool },
/// Container end marker for lists.
ListEnd,
/// Container start marker; `matching_end` pairs it with `QuoteEnd`.
QuoteStart,
/// Container end marker for quotes.
QuoteEnd,
/// Container start marker; `matching_end` pairs it with `GroupEnd`.
GroupStart,
/// Container end marker for groups.
GroupEnd,
// NOTE(review): `table_index` presumably indexes `InternalDocument::tables` — confirm.
Table { table_index: u32 },
// NOTE(review): `image_index` presumably indexes `InternalDocument::images` — confirm.
Image { image_index: u32 },
PageBreak,
/// OCR text tagged with an `OcrElementLevel`.
OcrText { level: OcrElementLevel },
}
impl ElementKind {
    /// Returns the fixed snake_case name of the variant, ignoring any payload.
    ///
    /// `InternalElement::text` and `InternalElement::with_index` pass this
    /// string to `InternalElementId::generate`.
    pub fn discriminant(&self) -> &'static str {
        match *self {
            // Plain content
            Self::Title => "title",
            Self::Paragraph => "paragraph",
            Self::Code => "code",
            Self::Formula => "formula",
            Self::Citation => "citation",
            Self::Admonition => "admonition",
            Self::RawBlock => "raw_block",
            Self::MetadataBlock => "metadata_block",
            Self::PageBreak => "page_break",
            Self::FootnoteDefinition => "footnote_definition",
            Self::FootnoteRef => "footnote_ref",
            Self::DefinitionTerm => "definition_term",
            Self::DefinitionDescription => "definition_description",
            // Variants with payload
            Self::Heading { .. } => "heading",
            Self::ListItem { .. } => "list_item",
            Self::Slide { .. } => "slide",
            Self::Table { .. } => "table",
            Self::Image { .. } => "image",
            Self::OcrText { .. } => "ocr_text",
            // Container markers
            Self::ListStart { .. } => "list_start",
            Self::ListEnd => "list_end",
            Self::QuoteStart => "quote_start",
            Self::QuoteEnd => "quote_end",
            Self::GroupStart => "group_start",
            Self::GroupEnd => "group_end",
        }
    }

    /// Returns `true` for `ListStart`, `QuoteStart` and `GroupStart`.
    pub fn is_container_start(&self) -> bool {
        // Exactly the start markers have a matching end marker.
        self.matching_end().is_some()
    }

    /// Returns `true` for `ListEnd`, `QuoteEnd` and `GroupEnd`.
    pub fn is_container_end(&self) -> bool {
        [Self::ListEnd, Self::QuoteEnd, Self::GroupEnd].contains(self)
    }

    /// Returns the end marker that closes this start marker, or `None` when
    /// `self` is not a start marker.
    pub fn matching_end(&self) -> Option<ElementKind> {
        let closer = match self {
            Self::QuoteStart => Self::QuoteEnd,
            Self::GroupStart => Self::GroupEnd,
            Self::ListStart { .. } => Self::ListEnd,
            _ => return None,
        };
        Some(closer)
    }
}
/// A typed link from a source to a [`RelationshipTarget`], stored in
/// `InternalDocument::relationships`.
#[derive(Debug, Clone, PartialEq)]
pub struct Relationship {
// NOTE(review): presumably the index of the source element in `InternalDocument::elements` — confirm.
pub source: u32,
/// Target of the link, either a numeric index or a string key.
pub target: RelationshipTarget,
/// Kind of relationship (re-exported from `document_structure`).
pub kind: RelationshipKind,
}
/// Target side of a [`Relationship`]: a numeric index or a string key.
#[derive(Debug, Clone, PartialEq)]
pub enum RelationshipTarget {
// NOTE(review): presumably an already-resolved element index — confirm.
Index(u32),
// NOTE(review): presumably a key (e.g. an anchor) still to be resolved to an index — confirm.
Key(String),
}
pub use super::document_structure::RelationshipKind;
// Compile-time guard: `assert_send_sync` only accepts `Send + Sync` types, so
// instantiating it for `InternalDocument` and `InternalElement` makes the build
// fail if either type stops being `Send` or `Sync`. Nothing here is called at runtime.
const _: () = {
// Generic function with an empty body; only its trait bound matters.
#[allow(dead_code)]
fn assert_send_sync<T: Send + Sync>() {}
// Never invoked; it exists so the instantiations below get type-checked.
#[allow(dead_code)]
fn _check() {
assert_send_sync::<InternalDocument>();
assert_send_sync::<InternalElement>();
}
};
#[cfg(test)]
mod tests {
    use super::*;

    // Identical inputs must always hash to the identical identifier.
    #[test]
    fn test_internal_element_id_deterministic() {
        let first = InternalElementId::generate("heading", "Introduction", Some(1), 0);
        let second = InternalElementId::generate("heading", "Introduction", Some(1), 0);
        assert_eq!(first, second);
    }

    // Changing only the index must change the identifier.
    #[test]
    fn test_internal_element_id_differs_by_index() {
        let at_zero = InternalElementId::generate("paragraph", "Same text", Some(1), 0);
        let at_one = InternalElementId::generate("paragraph", "Same text", Some(1), 1);
        assert_ne!(at_zero, at_one);
    }

    // Identifiers are the `ie-` prefix plus 12 more bytes.
    #[test]
    fn test_internal_element_id_format() {
        let id = InternalElementId::generate("title", "Hello", None, 0);
        let rendered = id.as_str();
        assert!(rendered.starts_with("ie-"));
        assert_eq!(rendered.len(), 3 + 12);
    }

    // Discriminant strings ignore variant payloads.
    #[test]
    fn test_element_kind_discriminant() {
        let cases = [
            (ElementKind::Title, "title"),
            (ElementKind::Heading { level: 2 }, "heading"),
            (ElementKind::ListStart { ordered: true }, "list_start"),
        ];
        for (kind, expected) in cases.iter() {
            assert_eq!(kind.discriminant(), *expected);
        }
    }

    // Start/end predicates and their pairing.
    #[test]
    fn test_container_markers() {
        let list_open = ElementKind::ListStart { ordered: false };
        assert!(list_open.is_container_start());
        assert!(ElementKind::ListEnd.is_container_end());
        assert!(!ElementKind::Paragraph.is_container_start());

        let quote_close = ElementKind::QuoteStart.matching_end();
        assert_eq!(quote_close, Some(ElementKind::QuoteEnd));
    }

    // The first pushed element lands at index 0 and is stored unchanged.
    #[test]
    fn test_internal_document_push() {
        let mut doc = InternalDocument::new("markdown");
        let paragraph = InternalElement::text(ElementKind::Paragraph, "Hello world", 0);
        let position = doc.push_element(paragraph);
        assert_eq!(position, 0);
        assert_eq!(doc.elements.len(), 1);
        assert_eq!(doc.elements[0].text, "Hello world");
    }

    // Builder methods set their fields and leave the rest untouched.
    #[test]
    fn test_internal_element_builder_pattern() {
        let heading = InternalElement::text(ElementKind::Heading { level: 2 }, "Methods", 1)
            .with_page(3)
            .with_anchor("methods")
            .with_layer(ContentLayer::Body);
        assert_eq!(heading.text, "Methods");
        assert_eq!(heading.page, Some(3));
        assert_eq!(heading.anchor.as_deref(), Some("methods"));
        assert_eq!(heading.depth, 1);
    }

    // `RelationshipKind` round-trips through its snake_case JSON form.
    #[test]
    fn test_relationship_kind_serde() {
        let original = RelationshipKind::FootnoteReference;
        let encoded = serde_json::to_string(&original).unwrap();
        assert_eq!(encoded, "\"footnote_reference\"");
        let decoded: RelationshipKind = serde_json::from_str(&encoded).unwrap();
        assert_eq!(decoded, original);
    }
}