Skip to main content

rpdfium_doc/
page_label.rs

//! Page label parsing and formatting (ISO 32000-2 section 12.4.2).
//!
//! Page labels allow PDF documents to use custom numbering schemes
//! (e.g., roman numerals for a preface, then decimal for the body).
5
6use rpdfium_core::{Name, PdfSource};
7use rpdfium_parser::{Object, ObjectStore};
8
9use crate::error::{DocError, DocResult};
10use crate::number_tree::NumberTree;
11
12/// A page label specification that applies from a given page index onward.
13#[derive(Debug, Clone)]
14pub struct PageLabel {
15    /// The numbering style (None means no numeric portion).
16    pub style: Option<PageLabelStyle>,
17    /// An optional label prefix string.
18    pub prefix: Option<String>,
19    /// The starting value for this range (default 1).
20    pub start: i64,
21}
22
/// How the numeric portion of a page label is rendered.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum PageLabelStyle {
    /// Arabic decimal digits: 1, 2, 3, ...
    Decimal,
    /// Roman numerals in capitals: I, II, III, ...
    UpperRoman,
    /// Roman numerals in small letters: i, ii, iii, ...
    LowerRoman,
    /// Capital letters of the alphabet, starting at A.
    UpperAlpha,
    /// Small letters of the alphabet, starting at a.
    LowerAlpha,
}
37
38/// Parse page labels from the document catalog's `/PageLabels` entry.
39///
40/// Returns a sorted list of `(page_index, PageLabel)` pairs.
41pub fn parse_page_labels<S: PdfSource>(
42    catalog: &Object,
43    store: &ObjectStore<S>,
44) -> DocResult<Vec<(i64, PageLabel)>> {
45    let catalog_dict = store
46        .deep_resolve(catalog)
47        .map_err(|e| DocError::Parser(e.to_string()))?
48        .as_dict()
49        .ok_or(DocError::UnexpectedType)?;
50
51    let labels_obj = match catalog_dict.get(&Name::page_labels()) {
52        Some(obj) => store
53            .deep_resolve(obj)
54            .map_err(|e| DocError::Parser(e.to_string()))?,
55        None => return Ok(Vec::new()),
56    };
57
58    let tree = NumberTree::parse(labels_obj, store, convert_page_label)?;
59    Ok(tree.entries().to_vec())
60}
61
62/// Convert a PDF object (page label dictionary) into a `PageLabel`.
63fn convert_page_label<S: PdfSource>(obj: &Object, store: &ObjectStore<S>) -> DocResult<PageLabel> {
64    let dict = obj.as_dict().ok_or(DocError::UnexpectedType)?;
65
66    // /S (style)
67    let style = dict
68        .get(&Name::s())
69        .and_then(|o| {
70            store
71                .deep_resolve(o)
72                .ok()
73                .and_then(|r| r.as_name().map(|n| n.as_str().into_owned()))
74        })
75        .and_then(|s| match s.as_str() {
76            "D" => Some(PageLabelStyle::Decimal),
77            "R" => Some(PageLabelStyle::UpperRoman),
78            "r" => Some(PageLabelStyle::LowerRoman),
79            "A" => Some(PageLabelStyle::UpperAlpha),
80            "a" => Some(PageLabelStyle::LowerAlpha),
81            _ => None,
82        });
83
84    // /P (prefix)
85    let prefix = dict.get(&Name::p()).and_then(|o| {
86        store
87            .deep_resolve(o)
88            .ok()
89            .and_then(|r| r.as_string().map(|s| s.to_string_lossy()))
90    });
91
92    // /St (start value, default 1)
93    let start = dict
94        .get(&Name::st())
95        .and_then(|o| o.as_i64())
96        .filter(|&n| n >= 1)
97        .unwrap_or(1);
98
99    Ok(PageLabel {
100        style,
101        prefix,
102        start,
103    })
104}
105
106/// Format a page label for the given page offset within its range.
107///
108/// `page_offset` is the 0-based offset from the range's starting page.
109/// The actual label number is `label.start + page_offset`.
110pub fn format_label(label: &PageLabel, page_offset: i64) -> String {
111    let num = label.start + page_offset;
112    let mut result = String::new();
113
114    if let Some(ref prefix) = label.prefix {
115        result.push_str(prefix);
116    }
117
118    if let Some(style) = label.style {
119        let num_str = match style {
120            PageLabelStyle::Decimal => format!("{num}"),
121            PageLabelStyle::UpperRoman => to_roman(num, true),
122            PageLabelStyle::LowerRoman => to_roman(num, false),
123            PageLabelStyle::UpperAlpha => to_alpha(num, true),
124            PageLabelStyle::LowerAlpha => to_alpha(num, false),
125        };
126        result.push_str(&num_str);
127    }
128
129    result
130}
131
/// Convert a positive integer to roman numerals.
///
/// Uses subtractive notation (`IV`, `IX`, `XL`, `XC`, `CD`, `CM`) and one `M`
/// per thousand. `upper` selects uppercase output; otherwise the numeral is
/// lowercased.
///
/// Returns an empty string when `num` is zero or negative.
///
/// The value is first reduced modulo 1,000,000. Without a bound, a huge number
/// (the start value comes straight from the PDF file) would emit one `M` per
/// thousand — an effectively endless loop with unbounded allocation. As a
/// consequence exact multiples of 1,000,000 render as an empty string.
/// NOTE(review): upstream PDFium appears to apply the same wrap-around in its
/// `MakeRoman` helper — confirm against the upstream source.
fn to_roman(num: i64, upper: bool) -> String {
    /// Values at or above this wrap around; keeps the output length bounded.
    const WRAP: i64 = 1_000_000;
    /// Numeral symbols, largest value first, including subtractive pairs.
    const TABLE: [(i64, &str); 13] = [
        (1000, "M"),
        (900, "CM"),
        (500, "D"),
        (400, "CD"),
        (100, "C"),
        (90, "XC"),
        (50, "L"),
        (40, "XL"),
        (10, "X"),
        (9, "IX"),
        (5, "V"),
        (4, "IV"),
        (1, "I"),
    ];

    if num <= 0 {
        return String::new();
    }

    let mut remaining = num % WRAP;
    let mut result = String::new();
    // Greedy conversion: take the largest symbol that still fits.
    for &(value, symbol) in &TABLE {
        while remaining >= value {
            result.push_str(symbol);
            remaining -= value;
        }
    }

    if !upper {
        // All symbols are ASCII letters, so in-place ASCII lowercasing is exact.
        result.make_ascii_lowercase();
    }
    result
}
164
/// Convert a positive integer to its alphabetic page-label form.
///
/// Follows the letter scheme ISO 32000 gives for the `A`/`a` label styles:
/// "A to Z for the first 26 pages, AA to ZZ for the next 26, and so on".
/// The letter is selected by `(num - 1) % 26` and repeated
/// `(num - 1) / 26 + 1` times:
/// 1=A, 2=B, ..., 26=Z, 27=AA, 28=BB, ..., 52=ZZ, 53=AAA, ...
///
/// This is *not* spreadsheet-column numbering (where 28 would be `AB`).
///
/// `upper` selects `A`-`Z`; otherwise `a`-`z` is used. Returns an empty string
/// when `num` is zero or negative.
///
/// The repetition count is reduced modulo 1000 so that an extreme number (the
/// start value comes straight from the PDF file) cannot force an enormous
/// allocation; a count that is an exact multiple of 1000 therefore yields an
/// empty string.
/// NOTE(review): upstream PDFium appears to apply the same wrap-around in its
/// `MakeLetters` helper — confirm against the upstream source.
fn to_alpha(num: i64, upper: bool) -> String {
    /// Number of letters in the alphabet used for labels.
    const LETTER_COUNT: i64 = 26;
    /// Repetition counts wrap at this value; keeps the output length bounded.
    const REPEAT_WRAP: i64 = 1000;

    if num <= 0 {
        return String::new();
    }

    let index = num - 1; // 0-based
    let base = if upper { b'A' } else { b'a' };
    // `index % 26` is in 0..26, so the narrowing cast cannot truncate.
    let letter = char::from(base + (index % LETTER_COUNT) as u8);
    // In 0..1000 after the wrap, so the cast to usize is lossless.
    let repeat = (index / LETTER_COUNT + 1) % REPEAT_WRAP;

    letter.to_string().repeat(repeat as usize)
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_decimal_labels() {
        let label = PageLabel {
            style: Some(PageLabelStyle::Decimal),
            prefix: None,
            start: 1,
        };
        assert_eq!(format_label(&label, 0), "1");
        assert_eq!(format_label(&label, 4), "5");
        assert_eq!(format_label(&label, 99), "100");
    }

    #[test]
    fn test_upper_roman() {
        let label = PageLabel {
            style: Some(PageLabelStyle::UpperRoman),
            prefix: None,
            start: 1,
        };
        assert_eq!(format_label(&label, 0), "I");
        assert_eq!(format_label(&label, 3), "IV");
        assert_eq!(format_label(&label, 8), "IX");
        assert_eq!(format_label(&label, 13), "XIV");
    }

    #[test]
    fn test_lower_roman() {
        let label = PageLabel {
            style: Some(PageLabelStyle::LowerRoman),
            prefix: None,
            start: 1,
        };
        assert_eq!(format_label(&label, 0), "i");
        assert_eq!(format_label(&label, 3), "iv");
        assert_eq!(format_label(&label, 8), "ix");
    }

    #[test]
    fn test_upper_alpha() {
        let label = PageLabel {
            style: Some(PageLabelStyle::UpperAlpha),
            prefix: None,
            start: 1,
        };
        assert_eq!(format_label(&label, 0), "A");
        assert_eq!(format_label(&label, 1), "B");
        assert_eq!(format_label(&label, 25), "Z");
        assert_eq!(format_label(&label, 26), "AA");
        assert_eq!(format_label(&label, 27), "BB");
    }

    #[test]
    fn test_lower_alpha() {
        let label = PageLabel {
            style: Some(PageLabelStyle::LowerAlpha),
            prefix: None,
            start: 1,
        };
        assert_eq!(format_label(&label, 0), "a");
        assert_eq!(format_label(&label, 25), "z");
        assert_eq!(format_label(&label, 26), "aa");
    }

    #[test]
    fn test_prefix_application() {
        let label = PageLabel {
            style: Some(PageLabelStyle::Decimal),
            prefix: Some("Appendix-".to_string()),
            start: 1,
        };
        assert_eq!(format_label(&label, 0), "Appendix-1");
        assert_eq!(format_label(&label, 2), "Appendix-3");
    }

    #[test]
    fn test_start_value_offset() {
        let label = PageLabel {
            style: Some(PageLabelStyle::Decimal),
            prefix: None,
            start: 10,
        };
        assert_eq!(format_label(&label, 0), "10");
        assert_eq!(format_label(&label, 5), "15");
    }

    #[test]
    fn test_no_style_prefix_only() {
        let label = PageLabel {
            style: None,
            prefix: Some("Cover".to_string()),
            start: 1,
        };
        assert_eq!(format_label(&label, 0), "Cover");
    }

    #[test]
    fn test_roman_edge_cases() {
        assert_eq!(to_roman(1, true), "I");
        assert_eq!(to_roman(4, true), "IV");
        assert_eq!(to_roman(9, true), "IX");
        assert_eq!(to_roman(14, true), "XIV");
        assert_eq!(to_roman(40, true), "XL");
        assert_eq!(to_roman(90, true), "XC");
        assert_eq!(to_roman(400, true), "CD");
        assert_eq!(to_roman(900, true), "CM");
        assert_eq!(to_roman(1999, true), "MCMXCIX");
    }

    #[test]
    fn test_alpha_edge_cases() {
        // ISO 32000: A..Z, then AA..ZZ (same letter doubled), then AAA..ZZZ, ...
        assert_eq!(to_alpha(1, true), "A");
        assert_eq!(to_alpha(26, true), "Z");
        assert_eq!(to_alpha(27, true), "AA");
        assert_eq!(to_alpha(28, true), "BB");
        assert_eq!(to_alpha(52, true), "ZZ");
        assert_eq!(to_alpha(53, true), "AAA");
        assert_eq!(to_alpha(28, false), "bb");
        assert_eq!(to_alpha(53, false), "aaa");
    }

    #[test]
    fn test_alpha_large_value_is_bounded() {
        // The repetition count wraps below 1000, whatever the input.
        assert!(to_alpha(i64::MAX, true).len() < 1000);
        assert!(to_alpha(i64::MAX, false).len() < 1000);
    }

    #[test]
    fn test_multi_range_format_sequence() {
        // Simulate a document with Roman preface (pages 0-3) then Decimal body (pages 4+)
        let roman_label = PageLabel {
            style: Some(PageLabelStyle::LowerRoman),
            prefix: None,
            start: 1,
        };
        assert_eq!(format_label(&roman_label, 0), "i");
        assert_eq!(format_label(&roman_label, 1), "ii");
        assert_eq!(format_label(&roman_label, 2), "iii");
        assert_eq!(format_label(&roman_label, 3), "iv");

        let decimal_label = PageLabel {
            style: Some(PageLabelStyle::Decimal),
            prefix: None,
            start: 1,
        };
        assert_eq!(format_label(&decimal_label, 0), "1");
        assert_eq!(format_label(&decimal_label, 1), "2");
        assert_eq!(format_label(&decimal_label, 5), "6");
    }

    #[test]
    fn test_format_label_no_style_no_prefix() {
        let label = PageLabel {
            style: None,
            prefix: None,
            start: 1,
        };
        assert_eq!(format_label(&label, 0), "");
        assert_eq!(format_label(&label, 99), "");
    }

    #[test]
    fn test_roman_zero_returns_empty() {
        assert_eq!(to_roman(0, true), "");
        assert_eq!(to_roman(0, false), "");
    }

    #[test]
    fn test_alpha_zero_returns_empty() {
        assert_eq!(to_alpha(0, true), "");
        assert_eq!(to_alpha(0, false), "");
    }

    /// Upstream: TEST_F(PageLabelTest, GetLabelPerf)
    ///
    /// Performance test: iterates labels for 10001 pages using a complex
    /// 3-level number tree. In rpdfium, we verify that format_label can
    /// handle all the ranges that the upstream tree defines, exercising
    /// each style (R, A, D, r, a) with various offsets including large ones.
    #[test]
    fn test_page_label_get_label_perf() {
        // Build the label ranges matching the upstream tree:
        //   0..99:    style=R (UpperRoman), start=1
        //   100..899: style=A (UpperAlpha), prefix="abc", start=5
        //   900..2999: style=D (Decimal), start=999
        //   3000..4999: style=r (LowerRoman), start=1
        //   5000..7999: style=a (LowerAlpha), start=1
        //   8000..10000: style=None, prefix="x"
        let ranges: Vec<(i64, PageLabel)> = vec![
            (
                0,
                PageLabel {
                    style: Some(PageLabelStyle::UpperRoman),
                    prefix: None,
                    start: 1,
                },
            ),
            (
                100,
                PageLabel {
                    style: Some(PageLabelStyle::UpperAlpha),
                    prefix: Some("abc".to_string()),
                    start: 5,
                },
            ),
            (
                900,
                PageLabel {
                    style: Some(PageLabelStyle::Decimal),
                    prefix: None,
                    start: 999,
                },
            ),
            (
                3000,
                PageLabel {
                    style: Some(PageLabelStyle::LowerRoman),
                    prefix: None,
                    start: 1,
                },
            ),
            (
                5000,
                PageLabel {
                    style: Some(PageLabelStyle::LowerAlpha),
                    prefix: None,
                    start: 1,
                },
            ),
            (
                8000,
                PageLabel {
                    style: None,
                    prefix: Some("x".to_string()),
                    start: 1,
                },
            ),
        ];

        // Iterate all 10001 pages (like upstream perf test)
        for page_index in 0..10001i64 {
            // Find the applicable range
            let range_idx = ranges.partition_point(|(start, _)| *start <= page_index) - 1;
            let (range_start, label) = &ranges[range_idx];
            let offset = page_index - range_start;
            let result = format_label(label, offset);
            // Just verify it doesn't panic and produces non-empty output
            // (except for pages 8000+ which have no style, prefix only)
            if label.style.is_some() || label.prefix.is_some() {
                assert!(!result.is_empty(), "page {page_index} produced empty label");
            }
        }

        // Spot-check key values from the upstream test
        let label_at = |page: i64| -> String {
            let range_idx = ranges.partition_point(|(start, _)| *start <= page) - 1;
            let (range_start, label) = &ranges[range_idx];
            format_label(label, page - range_start)
        };

        assert_eq!(label_at(0), "I");
        assert_eq!(label_at(1), "II");
        assert_eq!(label_at(37), "XXXVIII");
        assert_eq!(label_at(99), "C");
        assert_eq!(label_at(100), "abcE");
        assert_eq!(label_at(900), "999");
        assert_eq!(label_at(901), "1000");
        assert_eq!(label_at(3000), "i");
        assert_eq!(label_at(5000), "a");
        assert_eq!(label_at(8000), "x");
        assert_eq!(label_at(10000), "x");
    }
}