// oxirs_vec/content_processing/pdf_handler.rs
#[cfg(feature = "content-processing")]
6use crate::content_processing::{
7 ContentExtractionConfig, ContentLocation, DocumentFormat, DocumentStructure, ExtractedContent,
8 ExtractedImage, ExtractedLink, ExtractedTable, FormatHandler, Heading, ProcessingStats,
9 TocEntry,
10};
11#[cfg(feature = "content-processing")]
12use anyhow::{anyhow, Result};
13#[cfg(feature = "content-processing")]
14use std::collections::HashMap;
15
#[cfg(feature = "content-processing")]
/// Format handler that extracts text, metadata, and document structure from
/// PDF byte buffers. Stateless unit struct; text extraction is delegated to
/// the `pdf-extract` crate, everything else is heuristic post-processing.
pub struct PdfHandler;
19
#[cfg(feature = "content-processing")]
impl FormatHandler for PdfHandler {
    /// Extracts text plus document structure from a PDF byte buffer.
    ///
    /// Metadata always records the format, input size, and extraction method;
    /// fields scraped from the PDF Info dictionary (title, author, ...) are
    /// merged on top when found. Tables, links, and images are only extracted
    /// when the corresponding `config` flags are set.
    ///
    /// # Errors
    /// Fails when `pdf-extract` cannot parse the buffer or when the document
    /// yields no text at all.
    fn extract_content(
        &self,
        data: &[u8],
        config: &ContentExtractionConfig,
    ) -> Result<ExtractedContent> {
        // Idiomatic error propagation instead of a manual match + early return.
        let text = pdf_extract::extract_text_from_mem(data)
            .map_err(|e| anyhow!("Failed to extract text from PDF: {}", e))?;
        if text.trim().is_empty() {
            return Err(anyhow!("No text content found in PDF"));
        }

        let mut metadata = HashMap::new();
        metadata.insert("format".to_string(), "PDF".to_string());
        metadata.insert("size".to_string(), data.len().to_string());
        metadata.insert("extraction_method".to_string(), "pdf-extract".to_string());

        // Merge any fields recovered from the raw PDF Info dictionary;
        // `extend` replaces the original manual insert loop.
        if let Some(pdf_metadata) = self.extract_pdf_metadata(data) {
            metadata.extend(pdf_metadata);
        }

        // pdf-extract emits a form feed (\x0C) between pages, so the count is
        // a rough page estimate (presumably separators, not pages — a document
        // with N separators has N+1 pages; kept as-is, floor of one page).
        let estimated_pages = text.matches('\x0C').count().max(1);
        let headings = self.extract_pdf_headings(&text);

        let tables = if config.extract_tables {
            self.extract_pdf_tables(&text)
        } else {
            Vec::new()
        };

        let links = if config.extract_links {
            self.extract_pdf_links(&text)
        } else {
            Vec::new()
        };

        let toc = self.generate_table_of_contents(&headings);

        // Image extraction is best-effort: failures degrade to an empty list.
        let images = if config.extract_images {
            self.extract_pdf_images(data, config).unwrap_or_default()
        } else {
            Vec::new()
        };

        Ok(ExtractedContent {
            format: DocumentFormat::Pdf,
            text: text.trim().to_string(),
            metadata,
            images,
            tables,
            links,
            structure: DocumentStructure {
                title: self.extract_pdf_title(&text),
                headings: headings.clone(),
                page_count: estimated_pages,
                section_count: headings.len().max(1),
                table_of_contents: toc,
            },
            chunks: Vec::new(),
            language: None,
            processing_stats: ProcessingStats::default(),
            audio_content: Vec::new(),
            video_content: Vec::new(),
            cross_modal_embeddings: Vec::new(),
        })
    }

    /// Cheap sniff test: a PDF file starts with the `%PDF` magic bytes.
    /// `starts_with` subsumes the original explicit length check.
    fn can_handle(&self, data: &[u8]) -> bool {
        data.starts_with(b"%PDF")
    }

    fn supported_extensions(&self) -> Vec<&'static str> {
        vec!["pdf"]
    }
}
110
#[cfg(feature = "content-processing")]
impl PdfHandler {
    /// Builds a `ContentLocation` carrying only an optional line number; all
    /// other coordinates are unknown for plain-text PDF extraction.
    fn location_at(line: Option<usize>) -> ContentLocation {
        ContentLocation {
            page: None,
            section: None,
            char_offset: None,
            line,
            column: None,
        }
    }

    /// Heuristically picks a title: the first of the opening ten lines that is
    /// reasonably sized (6..=99 chars) and does not look like a URL.
    fn extract_pdf_title(&self, text: &str) -> Option<String> {
        text.lines()
            .take(10)
            .map(str::trim)
            .find(|t| t.len() > 5 && t.len() < 100 && !t.contains("http") && !t.contains("www"))
            .map(str::to_string)
    }

    /// Detects heading-like lines: short lines (6..=79 chars, at most 10
    /// words) where at least half the words start with an uppercase letter.
    /// All headings are reported at level 1 — plain text carries no outline.
    fn extract_pdf_headings(&self, text: &str) -> Vec<Heading> {
        let mut headings = Vec::new();
        for (i, line) in text.lines().enumerate() {
            let trimmed = line.trim();
            if trimmed.len() <= 5 || trimmed.len() >= 80 {
                continue;
            }
            let words: Vec<&str> = trimmed.split_whitespace().collect();
            let capitalized_words = words
                .iter()
                .filter(|w| w.chars().next().is_some_and(|c| c.is_uppercase()))
                .count();
            if capitalized_words >= words.len() / 2 && words.len() <= 10 {
                headings.push(Heading {
                    level: 1, // no nesting information available from plain text
                    text: trimmed.to_string(),
                    location: Self::location_at(Some(i + 1)),
                });
            }
        }
        headings
    }

    /// Heuristic table detection over plain-text lines.
    ///
    /// A line with 2..=8 whitespace-separated cells that either contains a
    /// number or shows tab/double-space alignment is treated as a table row;
    /// a run of at least two such rows is emitted as one table, its first row
    /// serving as the header.
    fn extract_pdf_tables(&self, text: &str) -> Vec<ExtractedTable> {
        let mut tables = Vec::new();
        let mut current_table: Vec<Vec<String>> = Vec::new();
        let mut in_table = false;
        let mut line_count = 0usize;

        for (i, line) in text.lines().enumerate() {
            line_count = i + 1;
            let trimmed = line.trim();
            let parts: Vec<&str> = trimmed.split_whitespace().collect();

            let looks_like_row = if (2..=8).contains(&parts.len()) {
                let has_numbers = parts.iter().any(|p| p.parse::<f64>().is_ok());
                // BUG FIX: the original counted *single* spaces, which marks
                // almost any line of three or more words as "consistently
                // spaced"; column alignment is signalled by tabs or repeated
                // double spaces.
                let consistent_spacing =
                    trimmed.contains('\t') || trimmed.matches("  ").count() >= 2;
                has_numbers || consistent_spacing
            } else {
                false
            };

            if looks_like_row {
                if !in_table {
                    in_table = true;
                    current_table.clear();
                }
                current_table.push(parts.iter().map(|s| s.to_string()).collect());
            } else if in_table {
                // Any non-row line ends the current table. (The original kept
                // a lone candidate row alive across interruptions; a single
                // row followed by prose is not a table, so it is discarded.)
                Self::flush_table(&mut tables, &mut current_table, i + 1);
                in_table = false;
            }
        }

        // Flush a table that runs to the end of the document.
        if in_table {
            Self::flush_table(&mut tables, &mut current_table, line_count);
        }

        tables
    }

    /// Emits `current` as an `ExtractedTable` (first row = header) when it has
    /// at least two rows, then clears it. Shared by the three flush sites the
    /// original duplicated verbatim.
    fn flush_table(
        tables: &mut Vec<ExtractedTable>,
        current: &mut Vec<Vec<String>>,
        line: usize,
    ) {
        if current.len() >= 2 {
            tables.push(ExtractedTable {
                headers: current[0].clone(),
                rows: current[1..].to_vec(),
                caption: None,
                location: Self::location_at(Some(line)),
            });
        }
        current.clear();
    }

    /// Finds `http(s)` URLs and e-mail addresses in the extracted text.
    /// Match offsets are not mapped back to lines, so link locations are empty.
    fn extract_pdf_links(&self, text: &str) -> Vec<ExtractedLink> {
        let mut links = Vec::new();

        // NOTE(review): both regexes are recompiled on every call; hoist them
        // into `std::sync::LazyLock` statics if this path becomes hot.
        let url_regex = regex::Regex::new(r"https?://[^\s\)]+").unwrap();
        let email_regex =
            regex::Regex::new(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}").unwrap();

        for mat in url_regex.find_iter(text) {
            // Strip trailing punctuation that is usually sentence syntax,
            // not part of the URL itself.
            let url = mat
                .as_str()
                .trim_end_matches(&['.', ',', ')', ']', '}'][..]);
            links.push(ExtractedLink {
                url: url.to_string(),
                text: url.to_string(),
                title: None,
                location: Self::location_at(None),
            });
        }

        for mat in email_regex.find_iter(text) {
            let email = mat.as_str();
            links.push(ExtractedLink {
                url: format!("mailto:{}", email),
                text: email.to_string(),
                title: None,
                location: Self::location_at(None),
            });
        }

        links
    }

    /// Best-effort scrape of the PDF Info dictionary from the raw bytes.
    ///
    /// Only uncompressed, parenthesised literal strings are handled (e.g.
    /// `/Title (My Doc)`); values inside compressed object streams or hex
    /// strings are not found. Returns `None` when nothing matched.
    fn extract_pdf_metadata(&self, data: &[u8]) -> Option<HashMap<String, String>> {
        // (PDF Info key, output metadata key) — replaces four copy-pasted
        // regex blocks in the original.
        const FIELDS: [(&str, &str); 4] = [
            ("Title", "title"),
            ("Author", "author"),
            ("Subject", "subject"),
            ("CreationDate", "creation_date"),
        ];

        // Lossy view is fine: the regexes only target ASCII-ish literals.
        // Borrowing the Cow avoids the original's unconditional `into_owned`.
        let content = String::from_utf8_lossy(data);
        let mut metadata = HashMap::new();

        for (pdf_key, out_key) in FIELDS {
            let pattern = format!(r"/{}\s*\(\s*([^)]+)\s*\)", pdf_key);
            if let Some(captures) = regex::Regex::new(&pattern).unwrap().captures(&content) {
                if let Some(value) = captures.get(1) {
                    metadata.insert(out_key.to_string(), value.as_str().to_string());
                }
            }
        }

        if metadata.is_empty() {
            None
        } else {
            Some(metadata)
        }
    }

    /// Placeholder: image extraction from PDFs is not implemented yet, so this
    /// always returns an empty list. (The original branched on
    /// `config.extract_images` but both arms were identical.)
    fn extract_pdf_images(
        &self,
        _data: &[u8],
        _config: &ContentExtractionConfig,
    ) -> Result<Vec<ExtractedImage>> {
        Ok(Vec::new())
    }

    /// Flattens the detected headings into a table of contents, copying each
    /// heading's text, level, and location.
    fn generate_table_of_contents(&self, headings: &[Heading]) -> Vec<TocEntry> {
        headings
            .iter()
            .map(|heading| TocEntry {
                title: heading.text.clone(),
                level: heading.level,
                page: heading.location.page,
                location: heading.location.clone(),
            })
            .collect()
    }
}