Skip to main content

edgeparse_core/pipeline/
parallel.rs

1//! Parallel page-processing utilities.
2//!
3//! On native targets: uses rayon for data parallelism.
4//! On wasm32: falls back to sequential iteration.
5//!
6//! Provides helpers that apply a per-page transformation in parallel across all
7//! pages of a document. Designed as a drop-in replacement for the sequential
8//! `for page in &mut pages { ... }` loops in the orchestrator.
9
10#[cfg(not(target_arch = "wasm32"))]
11use rayon::prelude::*;
12
13use crate::models::content::ContentElement;
14
15/// Per-page content alias (mirrors `orchestrator::PageContent`).
16type PageContent = Vec<ContentElement>;
17
18/// Apply `op` to each page in parallel, replacing each page's content.
19///
20/// This is the parallel equivalent of:
21/// ```ignore
22/// for page in &mut pages {
23///     let elements = std::mem::take(page);
24///     *page = op(elements);
25/// }
26/// ```
27pub fn par_map_pages<F>(pages: &mut Vec<PageContent>, op: F)
28where
29    F: Fn(Vec<ContentElement>) -> Vec<ContentElement> + Sync + Send,
30{
31    #[cfg(not(target_arch = "wasm32"))]
32    {
33        let results: Vec<PageContent> = std::mem::take(pages).into_par_iter().map(&op).collect();
34        *pages = results;
35    }
36    #[cfg(target_arch = "wasm32")]
37    {
38        let results: Vec<PageContent> = std::mem::take(pages).into_iter().map(op).collect();
39        *pages = results;
40    }
41}
42
43/// Apply `op` to each page in parallel where the closure also receives a
44/// zero-based page index.
45pub fn par_map_pages_indexed<F>(pages: &mut Vec<PageContent>, op: F)
46where
47    F: Fn(usize, Vec<ContentElement>) -> Vec<ContentElement> + Sync + Send,
48{
49    #[cfg(not(target_arch = "wasm32"))]
50    {
51        let results: Vec<PageContent> = std::mem::take(pages)
52            .into_par_iter()
53            .enumerate()
54            .map(|(i, page)| op(i, page))
55            .collect();
56        *pages = results;
57    }
58    #[cfg(target_arch = "wasm32")]
59    {
60        let results: Vec<PageContent> = std::mem::take(pages)
61            .into_iter()
62            .enumerate()
63            .map(|(i, page)| op(i, page))
64            .collect();
65        *pages = results;
66    }
67}
68
69/// Parallel fold — map each page to a value of type `T` and collect results.
70///
71/// Useful for gathering per-page statistics or metadata without modifying pages.
72pub fn par_extract<T, F>(pages: &[PageContent], op: F) -> Vec<T>
73where
74    T: Send,
75    F: Fn(&[ContentElement]) -> T + Sync + Send,
76{
77    #[cfg(not(target_arch = "wasm32"))]
78    {
79        pages.par_iter().map(|page| op(page)).collect()
80    }
81    #[cfg(target_arch = "wasm32")]
82    {
83        pages.iter().map(|page| op(page)).collect()
84    }
85}
86
87/// Configure the global rayon thread pool with the given number of threads.
88///
89/// On WASM, this is a no-op.
90#[cfg(not(target_arch = "wasm32"))]
91pub fn configure_thread_pool(num_threads: usize) -> Result<(), rayon::ThreadPoolBuildError> {
92    rayon::ThreadPoolBuilder::new()
93        .num_threads(num_threads)
94        .build_global()
95}
96
97/// Configure the global rayon thread pool with the given number of threads.
98///
99/// On WASM, this is a no-op.
100#[cfg(target_arch = "wasm32")]
101pub fn configure_thread_pool(_num_threads: usize) -> Result<(), String> {
102    Ok(())
103}
104
#[cfg(test)]
mod tests {
    use super::*;
    use crate::models::bbox::BoundingBox;
    use crate::models::chunks::TextChunk;
    use crate::models::content::ContentElement;
    use crate::models::enums::{PdfLayer, TextFormat, TextType};

    /// Build a text chunk whose only distinguishing field is `value`.
    fn text_chunk(val: &str) -> ContentElement {
        ContentElement::TextChunk(TextChunk {
            value: val.to_string(),
            bbox: BoundingBox::new(None, 0.0, 0.0, 100.0, 10.0),
            font_name: String::new(),
            font_size: 12.0,
            font_weight: 400.0,
            italic_angle: 0.0,
            font_color: String::new(),
            contrast_ratio: 21.0,
            symbol_ends: vec![],
            text_format: TextFormat::Normal,
            text_type: TextType::Regular,
            pdf_layer: PdfLayer::Main,
            ocg_visible: true,
            index: None,
            page_number: None,
            level: None,
            mcid: None,
        })
    }

    /// Return the text carried by `elem`; the fixtures here only hold text chunks.
    fn text_of(elem: &ContentElement) -> &str {
        match elem {
            ContentElement::TextChunk(chunk) => chunk.value.as_str(),
            _ => panic!("test fixtures only contain text chunks"),
        }
    }

    /// Flatten `pages` into per-page lists of text values for easy comparison.
    fn page_texts(pages: &[PageContent]) -> Vec<Vec<&str>> {
        pages.iter().map(|page| page.iter().map(text_of).collect()).collect()
    }

    #[test]
    fn test_par_map_pages_identity() {
        let mut pages = vec![
            vec![text_chunk("a"), text_chunk("b")],
            vec![text_chunk("c")],
        ];
        par_map_pages(&mut pages, |elems| elems);
        // Compare contents, not just lengths, so reordered pages or elements fail.
        assert_eq!(page_texts(&pages), vec![vec!["a", "b"], vec!["c"]]);
    }

    #[test]
    fn test_par_map_pages_transform() {
        let mut pages = vec![
            vec![text_chunk("a"), text_chunk("b"), text_chunk("c")],
            vec![text_chunk("x")],
        ];
        // Keep only first element per page
        par_map_pages(&mut pages, |mut elems| {
            elems.truncate(1);
            elems
        });
        // Each page must keep its own first element, in the original page order.
        assert_eq!(page_texts(&pages), vec![vec!["a"], vec!["x"]]);
    }

    #[test]
    fn test_par_map_pages_indexed() {
        let mut pages = vec![
            vec![text_chunk("a")],
            vec![text_chunk("b")],
            vec![text_chunk("c")],
        ];
        // Tag every element with the index the closure was given. This checks
        // that index `i` is paired with page `i` and that results land back in
        // slot `i`, which a bare "set of indices seen" check cannot detect.
        par_map_pages_indexed(&mut pages, |i, elems| {
            elems
                .iter()
                .map(|elem| text_chunk(&format!("{}:{}", i, text_of(elem))))
                .collect()
        });
        assert_eq!(
            page_texts(&pages),
            vec![vec!["0:a"], vec!["1:b"], vec!["2:c"]]
        );
    }

    #[test]
    fn test_par_extract() {
        let pages = vec![
            vec![text_chunk("a"), text_chunk("b")],
            vec![text_chunk("c")],
            vec![],
        ];
        let counts: Vec<usize> = par_extract(&pages, |elems| elems.len());
        assert_eq!(counts, vec![2, 1, 0]);

        // The closure must see each page's own elements, in order.
        let joined: Vec<String> =
            par_extract(&pages, |elems| elems.iter().map(text_of).collect::<String>());
        assert_eq!(joined, vec!["ab", "c", ""]);

        // Extraction only borrows the pages; the input must be unchanged.
        assert_eq!(page_texts(&pages), vec![vec!["a", "b"], vec!["c"], vec![]]);
    }

    #[test]
    fn test_empty_pages() {
        let mut pages: Vec<PageContent> = vec![];
        par_map_pages(&mut pages, |e| e);
        assert!(pages.is_empty());

        par_map_pages_indexed(&mut pages, |_, e| e);
        assert!(pages.is_empty());

        let counts: Vec<usize> = par_extract(&pages, |e| e.len());
        assert!(counts.is_empty());
    }
}