1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
//! PDF source parser: a byte-exact lossless network plus concept-tagged
//! document-structure links.
//!
//! Parsing starts from [`LinkNetwork::parse_lossless_text`], so every byte of
//! the PDF is preserved as a `Token` leaf and [`LinkNetwork::reconstruct_text`]
//! returns the input verbatim. On top of those leaves this module recovers the
//! document structure of the [text PDF profile](crate::document_formatting) and
//! interns it as additive `Concept`/`Object` links: one shared concept point
//! per role (`heading`, `paragraph`, `bullet-list`, `list-item`, `strong`,
//! `emphasis`, …) and one structure `Object` link per occurrence, referencing
//! both its parent structure node and the concept it instantiates.
//!
//! The structure links are purely additive tags: they carry no spans, are never
//! read back by reconstruction, and only enrich the network with the formatting
//! concepts required by issue #84 for bold/italic/heading/paragraph/list.
use crate::document_formatting::{parse_pdf_document, BlockNode, InlineNode};
use crate::{LinkId, LinkMetadata, LinkNetwork, LinkType, ParseConfiguration};
/// Parses PDF `text` into a lossless network enriched with concept-tagged
/// document-structure links.
pub fn parse(text: &str, language: &str, configuration: ParseConfiguration) -> LinkNetwork {
let mut network = LinkNetwork::parse_lossless_text(text, language, configuration);
let document = parse_pdf_document(text);
if document.blocks.is_empty() {
// Out-of-profile PDF: keep the byte-exact lossless network untagged.
return network;
}
let Some(root) = document_link(&network) else {
return network;
};
for block in &document.blocks {
annotate_block(&mut network, root, block, language);
}
network
}
/// Finds the document root inserted by [`LinkNetwork::parse_lossless_text`].
fn document_link(network: &LinkNetwork) -> Option<LinkId> {
network
.links()
.find(|link| link.metadata().link_type() == Some(LinkType::Document))
.map(crate::Link::id)
}
fn annotate_block(network: &mut LinkNetwork, parent: LinkId, block: &BlockNode, language: &str) {
let structure = insert_structure(network, parent, block.concept_id(), language);
match block {
BlockNode::Heading { children, .. } | BlockNode::Paragraph { children } => {
annotate_inline(network, structure, children, language);
}
BlockNode::List { items, .. } => {
for item in items {
let list_item = insert_structure(network, structure, "list-item", language);
annotate_inline(network, list_item, item, language);
}
}
}
}
fn annotate_inline(
network: &mut LinkNetwork,
parent: LinkId,
nodes: &[InlineNode],
language: &str,
) {
for node in nodes {
if let InlineNode::Wrapped {
concept, children, ..
} = node
{
let structure = insert_structure(network, parent, concept, language);
annotate_inline(network, structure, children, language);
}
}
}
/// Interns the shared concept point for `concept` and inserts a structure
/// `Object` link referencing both its `parent` node and that concept point.
fn insert_structure(
network: &mut LinkNetwork,
parent: LinkId,
concept: &str,
language: &str,
) -> LinkId {
let concept_point = network.insert_typed_point(concept, LinkType::Concept, None);
network.insert_link(
[parent, concept_point],
LinkMetadata::new()
.with_link_type(LinkType::Object)
.with_named(true)
.with_term(concept)
.with_language(language),
)
}