Skip to main content

pdfvec/
document.rs

1//! Document types for extracted PDF content.
2
3use crate::Page;
4
5/// A fully extracted PDF document.
6///
7/// Contains all pages and provides convenient access patterns.
8///
9/// # Example
10///
11/// ```no_run
12/// use pdfvec::{Extractor, Result};
13///
14/// fn main() -> Result<()> {
15///     let data = std::fs::read("document.pdf")?;
16///     let doc = Extractor::new().extract_document(&data)?;
17///
18///     println!("Extracted {} pages", doc.page_count());
19///     for page in doc.pages() {
20///         println!("Page {}: {} chars", page.number(), page.char_count());
21///     }
22///     Ok(())
23/// }
24/// ```
25#[derive(Debug, Clone)]
26pub struct Document {
27    pages: Vec<Page>,
28}
29
30impl Document {
31    pub(crate) fn new(pages: Vec<Page>) -> Self {
32        Self { pages }
33    }
34
35    /// Returns the number of pages in the document.
36    #[must_use]
37    pub fn page_count(&self) -> usize {
38        self.pages.len()
39    }
40
41    /// Returns true if the document has no pages with text.
42    #[must_use]
43    pub fn is_empty(&self) -> bool {
44        self.pages.is_empty()
45    }
46
47    /// Returns an iterator over the pages.
48    pub fn pages(&self) -> impl Iterator<Item = &Page> {
49        self.pages.iter()
50    }
51
52    /// Returns an iterator over the pages (same as [`pages`](Document::pages)).
53    pub fn iter(&self) -> std::slice::Iter<'_, Page> {
54        self.pages.iter()
55    }
56
57    /// Returns a specific page by 1-indexed number.
58    ///
59    /// Returns `None` if the page number is out of range.
60    #[must_use]
61    pub fn page(&self, number: u32) -> Option<&Page> {
62        if number == 0 {
63            return None;
64        }
65        self.pages.iter().find(|p| p.number() == number)
66    }
67
68    /// Consumes the document and returns all pages.
69    #[must_use]
70    pub fn into_pages(self) -> Vec<Page> {
71        self.pages
72    }
73
74    /// Returns all text concatenated with the given separator.
75    #[must_use]
76    pub fn text(&self, separator: &str) -> String {
77        self.pages
78            .iter()
79            .filter(|p| !p.is_empty())
80            .map(Page::text)
81            .collect::<Vec<_>>()
82            .join(separator)
83    }
84
85    /// Returns all text concatenated with default separator (double newline).
86    #[must_use]
87    pub fn full_text(&self) -> String {
88        self.text("\n\n")
89    }
90
91    /// Returns total character count across all pages.
92    #[must_use]
93    pub fn total_chars(&self) -> usize {
94        self.pages.iter().map(Page::char_count).sum()
95    }
96}
97
98impl IntoIterator for Document {
99    type Item = Page;
100    type IntoIter = std::vec::IntoIter<Page>;
101
102    fn into_iter(self) -> Self::IntoIter {
103        self.pages.into_iter()
104    }
105}
106
107impl<'a> IntoIterator for &'a Document {
108    type Item = &'a Page;
109    type IntoIter = std::slice::Iter<'a, Page>;
110
111    fn into_iter(self) -> Self::IntoIter {
112        self.pages.iter()
113    }
114}
115
#[cfg(test)]
mod tests {
    use super::*;

    /// Fixture: two pages carrying text followed by one page with no text.
    fn sample_doc() -> Document {
        let first = Page::new(1, "First page".to_string());
        let second = Page::new(2, "Second page".to_string());
        let blank = Page::new(3, String::new());
        Document::new(vec![first, second, blank])
    }

    #[test]
    fn page_count() {
        // The blank third page is counted as well.
        assert_eq!(sample_doc().page_count(), 3);
    }

    #[test]
    fn get_page_by_number() {
        let doc = sample_doc();

        let first = doc.page(1).map(Page::text);
        assert_eq!(first, Some("First page"));

        let second = doc.page(2).map(Page::text);
        assert_eq!(second, Some("Second page"));

        // Zero is never a valid page number, and 99 is past the end.
        for missing in [0, 99] {
            assert!(doc.page(missing).is_none());
        }
    }

    #[test]
    fn full_text_skips_empty() {
        let joined = sample_doc().full_text();
        assert_eq!(joined, "First page\n\nSecond page");
    }

    #[test]
    fn custom_separator() {
        let joined = sample_doc().text(" | ");
        assert_eq!(joined, "First page | Second page");
    }

    #[test]
    fn total_chars() {
        // 10 chars on page one, 11 on page two, none on the blank page.
        let expected = 10 + 11;
        assert_eq!(sample_doc().total_chars(), expected);
    }

    #[test]
    fn into_iterator() {
        let mut texts = Vec::new();
        for page in sample_doc() {
            texts.push(page.into_text());
        }
        assert_eq!(texts, vec!["First page", "Second page", ""]);
    }

    #[test]
    fn ref_iterator() {
        let doc = sample_doc();
        let mut count = 0;
        for _page in &doc {
            count += 1;
        }
        assert_eq!(count, 3);
    }

    #[test]
    fn empty_document() {
        let doc = Document::new(Vec::new());
        assert!(doc.is_empty());
        assert_eq!(doc.page_count(), 0);
        assert_eq!(doc.full_text(), "");
    }
}