1use std::collections::HashMap;
2
3use crate::ast::{Document, DocumentMetadata, Node, Span, Warning};
4use crate::parser::{strip_subset_prefix, FontInfo, RawTextSegment};
5
/// Tunable settings for heading and inline-formatting detection.
#[derive(Debug, Clone, PartialEq)]
pub struct DetectorConfig {
    /// Font-size-to-body-size ratio that [`build_document`] hands to
    /// [`detect_headings`] as its heading threshold.
    pub heading_size_ratio: f32,
    /// When `false`, [`build_document`] clears the bold flag on every span.
    pub detect_bold: bool,
    /// When `false`, [`build_document`] clears the italic flag on every span.
    pub detect_italic: bool,
}
21
22impl Default for DetectorConfig {
23 fn default() -> Self {
24 Self {
25 heading_size_ratio: 1.2,
26 detect_bold: true,
27 detect_italic: true,
28 }
29 }
30}
31
32#[derive(Debug, Clone, PartialEq)]
34pub struct ClassifiedSegment {
35 pub segment: RawTextSegment,
36 pub classification: SegmentClass,
37}
38
/// Structural role assigned to a text segment.
#[derive(Debug, Clone, PartialEq)]
pub enum SegmentClass {
    /// A heading and its level; [`detect_headings`] produces levels 1 (largest
    /// text) through 4.
    Heading(u8),
    /// Ordinary body text.
    Body,
}
47
48pub fn compute_body_size(segments: &[RawTextSegment]) -> f32 {
54 if segments.is_empty() {
55 return 12.0;
56 }
57
58 let mut counts: HashMap<i32, usize> = HashMap::new();
59 for segment in segments {
60 let key = (segment.font_size * 100.0).round() as i32;
61 *counts.entry(key).or_insert(0) += 1;
62 }
63
64 let mut best_key = 1200;
68 let mut best_count = 0usize;
69
70 for (key, count) in counts {
71 if count > best_count || (count == best_count && key < best_key) {
72 best_key = key;
73 best_count = count;
74 }
75 }
76
77 best_key as f32 / 100.0
78}
79
80pub fn detect_headings(
92 segments: Vec<RawTextSegment>,
93 body_size: f32,
94 heading_size_ratio: f32,
95) -> Vec<ClassifiedSegment> {
96 let safe_body = if body_size > 0.0 { body_size } else { 12.0 };
97
98 segments
99 .into_iter()
100 .map(|segment| {
101 let ratio = segment.font_size / safe_body;
102 let classification = if ratio >= 2.0 {
103 SegmentClass::Heading(1)
105 } else if ratio >= 1.7 {
106 SegmentClass::Heading(2)
108 } else if ratio >= 1.4 {
109 SegmentClass::Heading(3)
111 } else if ratio >= heading_size_ratio {
112 SegmentClass::Heading(4)
114 } else {
115 SegmentClass::Body
116 };
117
118 ClassifiedSegment {
119 segment,
120 classification,
121 }
122 })
123 .collect()
124}
125
126pub fn detect_formatting(font_name: &str, font_info: &FontInfo) -> (bool, bool) {
138 let stripped = strip_subset_prefix(font_name);
140 let normalized = stripped.to_lowercase();
141
142 let has_bold_combo = normalized.contains("bolditalic") || normalized.contains("boldoblique");
145 let mut bold = has_bold_combo || normalized.contains("bold");
146 let mut italic =
147 has_bold_combo || normalized.contains("italic") || normalized.contains("oblique");
148
149 if !bold {
150 bold = font_info.font_weight.map(|w| w > 600.0).unwrap_or(false);
151 }
152
153 if !italic {
154 italic = font_info
155 .italic_angle
156 .map(|angle| angle.abs() > f32::EPSILON)
157 .unwrap_or(false);
158 }
159
160 (bold, italic)
161}
162
163fn flush_group(kind: &Option<SegmentClass>, spans: Vec<Span>, nodes: &mut Vec<Node>) {
167 if spans.is_empty() {
168 return;
169 }
170 match kind {
171 Some(SegmentClass::Heading(level)) => {
172 nodes.push(Node::Heading {
173 level: *level,
174 spans,
175 });
176 }
177 _ => {
178 nodes.push(Node::Paragraph { spans });
179 }
180 }
181}
182
183pub fn build_document(
195 segments: Vec<RawTextSegment>,
196 fonts: &HashMap<Vec<u8>, FontInfo>,
197 config: &DetectorConfig,
198 metadata: DocumentMetadata,
199) -> (Document, Vec<Warning>) {
200 let mut warnings = Vec::new();
201 let body_size = compute_body_size(&segments);
202 let classified = detect_headings(segments, body_size, config.heading_size_ratio);
203
204 let mut nodes = Vec::new();
205 let mut current_kind: Option<SegmentClass> = None;
206 let mut current_spans: Vec<Span> = Vec::new();
207
208 for item in classified {
209 let font = match fonts.get(&item.segment.font_resource_name) {
210 Some(font) => font,
211 None => {
212 flush_group(
215 ¤t_kind,
216 std::mem::take(&mut current_spans),
217 &mut nodes,
218 );
219 current_kind = None;
220
221 warnings.push(Warning::MissingFontMetrics {
222 font_name: String::from_utf8_lossy(&item.segment.font_resource_name)
223 .to_string(),
224 page: item.segment.page_number,
225 });
226 nodes.push(Node::RawText(item.segment.text));
227 continue;
228 }
229 };
230
231 let (mut bold, mut italic) = detect_formatting(&font.name, font);
232 if !config.detect_bold {
233 bold = false;
234 }
235 if !config.detect_italic {
236 italic = false;
237 }
238
239 let span = Span {
240 text: item.segment.text,
241 bold,
242 italic,
243 font_size: item.segment.font_size,
244 font_name: Some(font.name.clone()),
245 };
246
247 let same_class = match (¤t_kind, &item.classification) {
249 (Some(SegmentClass::Heading(a)), SegmentClass::Heading(b)) => a == b,
250 (Some(SegmentClass::Body), SegmentClass::Body) => true,
251 (None, _) => true, _ => false,
253 };
254
255 if !same_class {
256 flush_group(
257 ¤t_kind,
258 std::mem::take(&mut current_spans),
259 &mut nodes,
260 );
261 }
262
263 current_kind = Some(item.classification);
264 current_spans.push(span);
265 }
266
267 flush_group(
269 ¤t_kind,
270 std::mem::take(&mut current_spans),
271 &mut nodes,
272 );
273
274 (Document { metadata, nodes }, warnings)
275}
276
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a page-1 segment that uses font resource `F1`.
    fn seg(text: &str, font_size: f32) -> RawTextSegment {
        RawTextSegment {
            text: text.to_string(),
            font_resource_name: b"F1".to_vec(),
            font_size,
            page_number: 1,
        }
    }

    /// Builds a segment with an explicit font resource name and page number.
    fn seg_with_font(
        text: &str,
        font_resource: &[u8],
        font_size: f32,
        page: usize,
    ) -> RawTextSegment {
        RawTextSegment {
            text: text.to_string(),
            font_resource_name: font_resource.to_vec(),
            font_size,
            page_number: page,
        }
    }

    /// Builds a `FontInfo` with the given name and descriptor metrics.
    fn font_info(name: &str, font_weight: Option<f32>, italic_angle: Option<f32>) -> FontInfo {
        FontInfo {
            name: name.to_string(),
            size: None,
            font_weight,
            italic_angle,
        }
    }

    /// Collects `(resource name, font)` pairs into the map `build_document` expects.
    fn map_fonts<const N: usize>(entries: [(Vec<u8>, FontInfo); N]) -> HashMap<Vec<u8>, FontInfo> {
        entries.into_iter().collect()
    }

    #[test]
    fn compute_body_size_uses_mode_with_smaller_tie_breaker() {
        // Three sizes occur twice each; the smallest must win the tie.
        let segments = vec![
            seg("a", 12.0),
            seg("b", 12.0),
            seg("c", 14.0),
            seg("d", 14.0),
            seg("e", 10.0),
            seg("f", 10.0),
        ];

        assert_eq!(compute_body_size(&segments), 10.0);
    }

    #[test]
    fn compute_body_size_returns_default_on_empty_segments() {
        assert_eq!(compute_body_size(&[]), 12.0);
    }

    #[test]
    fn detect_headings_maps_ratios_to_levels_and_boundaries() {
        let body = 10.0;
        // Each size sits exactly on a level boundary; the last is just below
        // the heading threshold.
        let segments = vec![
            seg("h1", 20.0),
            seg("h2", 17.0),
            seg("h3", 14.0),
            seg("h4", 12.0),
            seg("body", 11.99),
        ];

        let classes = detect_headings(segments, body, 1.2)
            .into_iter()
            .map(|c| c.classification)
            .collect::<Vec<_>>();

        assert_eq!(classes[0], SegmentClass::Heading(1));
        assert_eq!(classes[1], SegmentClass::Heading(2));
        assert_eq!(classes[2], SegmentClass::Heading(3));
        assert_eq!(classes[3], SegmentClass::Heading(4));
        assert_eq!(classes[4], SegmentClass::Body);
    }

    #[test]
    fn detect_formatting_reads_font_name_patterns_and_subset_prefix() {
        // No descriptor metrics, so only the name can set a flag.
        let info = font_info("ignored", None, None);

        assert_eq!(detect_formatting("Arial-Bold", &info), (true, false));
        assert_eq!(
            detect_formatting("TimesNewRoman-Italic", &info),
            (false, true)
        );
        assert_eq!(
            detect_formatting("ABCDEF+Helvetica-BoldOblique", &info),
            (true, true)
        );
    }

    #[test]
    fn detect_formatting_falls_back_to_descriptor_metrics() {
        let info = font_info("mystery-font", Some(700.0), Some(-10.0));
        assert_eq!(detect_formatting("CustomFont-Regular", &info), (true, true));
    }

    #[test]
    fn build_document_groups_consecutive_classification_and_preserves_spans() {
        let segments = vec![
            seg_with_font("Chapter 1", b"F1", 24.0, 1),
            seg_with_font("Intro", b"F1", 24.0, 1),
            seg_with_font("Body A", b"F2", 12.0, 1),
            seg_with_font("Body B", b"F2", 12.0, 1),
        ];

        let fonts = map_fonts([
            (
                b"F1".to_vec(),
                font_info("Helvetica-Bold", Some(700.0), None),
            ),
            (b"F2".to_vec(), font_info("Helvetica", None, None)),
        ]);

        let cfg = DetectorConfig::default();
        let metadata = DocumentMetadata {
            title: Some("Demo".to_string()),
            author: None,
            page_count: 1,
        };

        let (doc, warnings) = build_document(segments, &fonts, &cfg, metadata.clone());

        assert!(warnings.is_empty());
        assert_eq!(doc.metadata, metadata);
        assert_eq!(doc.nodes.len(), 2);

        // The two 24pt segments merge into one level-1 heading.
        match &doc.nodes[0] {
            Node::Heading { level, spans } => {
                assert_eq!(*level, 1);
                assert_eq!(spans.len(), 2);
                assert_eq!(spans[0].text, "Chapter 1");
                assert_eq!(spans[1].text, "Intro");
                assert!(spans[0].bold);
            }
            other => panic!("expected Heading, got {:?}", other),
        }

        // The two 12pt segments merge into one paragraph.
        match &doc.nodes[1] {
            Node::Paragraph { spans } => {
                assert_eq!(spans.len(), 2);
                assert_eq!(spans[0].text, "Body A");
                assert_eq!(spans[1].text, "Body B");
                assert!(!spans[0].bold);
            }
            other => panic!("expected Paragraph, got {:?}", other),
        }
    }

    #[test]
    fn build_document_uses_raw_text_when_font_is_missing() {
        let segments = vec![seg_with_font("Unknown font", b"FX", 12.0, 1)];
        let cfg = DetectorConfig::default();

        let (doc, warnings) = build_document(
            segments,
            &HashMap::new(),
            &cfg,
            DocumentMetadata {
                title: None,
                author: None,
                page_count: 1,
            },
        );

        assert_eq!(doc.nodes, vec![Node::RawText("Unknown font".to_string())]);
        assert_eq!(warnings.len(), 1);
        assert!(matches!(warnings[0], Warning::MissingFontMetrics { .. }));
    }
}