Skip to main content

drasi_bootstrap_http/
pagination.rs

1// Copyright 2025 The Drasi Authors.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15//! Pagination strategies for HTTP bootstrap requests.
16
17use anyhow::{anyhow, Result};
18use reqwest::header::HeaderMap;
19use serde_json::Value as JsonValue;
20use url::Url;
21
22use crate::config::PaginationConfig;
23
/// Describes how to modify the next request for pagination.
///
/// Produced by [`Paginator::next_page`]; the paginators in this file return
/// `QueryParams` for offset/page/cursor strategies and `NewUrl` for the
/// Link-header and next-URL strategies.
#[derive(Debug)]
pub enum NextPage {
    /// Modify query parameters on the original URL.
    ///
    /// Each entry is a `(parameter name, value)` pair.
    QueryParams(Vec<(String, String)>),
    /// Use a completely new URL for the next request.
    NewUrl(String),
}
32
/// Trait for pagination state machines.
///
/// `next_page` takes `&mut self`, so implementations may keep per-run state
/// (for example the current offset or page number) between calls.
pub trait Paginator: Send + Sync {
    /// Initialize the paginator, returning any query params for the first request.
    fn initial_params(&self) -> Vec<(String, String)>;

    /// Given the previous response body and headers, determine the next page request.
    /// Returns None if there are no more pages.
    ///
    /// `items_count` is compared against the page size (or zero) by the
    /// implementations in this file; presumably it is the number of items
    /// extracted from the previous response — confirm against the caller.
    ///
    /// # Errors
    ///
    /// Implementations that follow server-provided URLs return an error when
    /// the URL fails validation.
    fn next_page(
        &mut self,
        response_body: &JsonValue,
        response_headers: &HeaderMap,
        items_count: usize,
    ) -> Result<Option<NextPage>>;
}
47
48/// Validate that a pagination-followed URL shares the same scheme+host as the
49/// original endpoint URL (SSRF prevention).
50fn validate_pagination_url(next_url: &str, origin_host: &str) -> Result<String> {
51    let parsed =
52        Url::parse(next_url).map_err(|e| anyhow!("Invalid pagination URL '{next_url}': {e}"))?;
53
54    let scheme = parsed.scheme();
55    if scheme != "http" && scheme != "https" {
56        return Err(anyhow!(
57            "Pagination URL has disallowed scheme '{scheme}': {next_url}"
58        ));
59    }
60
61    let host = parsed
62        .host_str()
63        .ok_or_else(|| anyhow!("Pagination URL has no host: {next_url}"))?;
64
65    if host != origin_host {
66        return Err(anyhow!(
67            "Pagination URL host '{host}' does not match origin host '{origin_host}' (SSRF protection)"
68        ));
69    }
70
71    Ok(next_url.to_string())
72}
73
74/// Extract the host from a URL string for SSRF origin validation.
75pub fn extract_origin_host(url: &str) -> Option<String> {
76    Url::parse(url)
77        .ok()
78        .and_then(|u| u.host_str().map(|h| h.to_string()))
79}
80
81/// Create a paginator from configuration.
82pub fn create_paginator(config: &PaginationConfig, origin_host: String) -> Box<dyn Paginator> {
83    match config {
84        PaginationConfig::OffsetLimit {
85            offset_param,
86            limit_param,
87            page_size,
88            total_path,
89        } => Box::new(OffsetLimitPaginator {
90            offset_param: offset_param.clone(),
91            limit_param: limit_param.clone(),
92            page_size: *page_size,
93            total_path: total_path.clone(),
94            current_offset: 0,
95        }),
96        PaginationConfig::PageNumber {
97            page_param,
98            page_size_param,
99            page_size,
100            total_pages_path,
101        } => Box::new(PageNumberPaginator {
102            page_param: page_param.clone(),
103            page_size_param: page_size_param.clone(),
104            page_size: *page_size,
105            total_pages_path: total_pages_path.clone(),
106            current_page: 1,
107        }),
108        PaginationConfig::Cursor {
109            cursor_param,
110            cursor_path,
111            has_more_path,
112            page_size_param,
113            page_size,
114        } => Box::new(CursorPaginator {
115            cursor_param: cursor_param.clone(),
116            cursor_path: cursor_path.clone(),
117            has_more_path: has_more_path.clone(),
118            page_size_param: page_size_param.clone(),
119            page_size: *page_size,
120        }),
121        PaginationConfig::LinkHeader {
122            page_size_param,
123            page_size,
124        } => Box::new(LinkHeaderPaginator {
125            page_size_param: page_size_param.clone(),
126            page_size: *page_size,
127            origin_host: origin_host.clone(),
128        }),
129        PaginationConfig::NextUrl {
130            next_url_path,
131            base_url,
132        } => Box::new(NextUrlPaginator {
133            next_url_path: next_url_path.clone(),
134            base_url: base_url.clone(),
135            origin_host,
136        }),
137    }
138}
139
140// ── Offset/Limit ────────────────────────────────────────────────────────────
141
142struct OffsetLimitPaginator {
143    offset_param: String,
144    limit_param: String,
145    page_size: u64,
146    total_path: Option<String>,
147    current_offset: u64,
148}
149
150impl Paginator for OffsetLimitPaginator {
151    fn initial_params(&self) -> Vec<(String, String)> {
152        vec![
153            (self.offset_param.clone(), "0".to_string()),
154            (self.limit_param.clone(), self.page_size.to_string()),
155        ]
156    }
157
158    fn next_page(
159        &mut self,
160        response_body: &JsonValue,
161        _response_headers: &HeaderMap,
162        items_count: usize,
163    ) -> Result<Option<NextPage>> {
164        self.current_offset += self.page_size;
165
166        // If we got fewer items than page_size, we're done
167        if (items_count as u64) < self.page_size {
168            return Ok(None);
169        }
170
171        // If total_path is set, check if we've fetched everything
172        if let Some(ref total_path) = self.total_path {
173            if let Some(total) = extract_json_path_u64(response_body, total_path) {
174                if self.current_offset >= total {
175                    return Ok(None);
176                }
177            }
178        }
179
180        Ok(Some(NextPage::QueryParams(vec![
181            (self.offset_param.clone(), self.current_offset.to_string()),
182            (self.limit_param.clone(), self.page_size.to_string()),
183        ])))
184    }
185}
186
187// ── Page Number ─────────────────────────────────────────────────────────────
188
189struct PageNumberPaginator {
190    page_param: String,
191    page_size_param: String,
192    page_size: u64,
193    total_pages_path: Option<String>,
194    current_page: u64,
195}
196
197impl Paginator for PageNumberPaginator {
198    fn initial_params(&self) -> Vec<(String, String)> {
199        vec![
200            (self.page_param.clone(), "1".to_string()),
201            (self.page_size_param.clone(), self.page_size.to_string()),
202        ]
203    }
204
205    fn next_page(
206        &mut self,
207        response_body: &JsonValue,
208        _response_headers: &HeaderMap,
209        items_count: usize,
210    ) -> Result<Option<NextPage>> {
211        self.current_page += 1;
212
213        // If we got fewer items than page_size, we're done
214        if (items_count as u64) < self.page_size {
215            return Ok(None);
216        }
217
218        // If total_pages_path is set, check if we've exceeded total pages
219        if let Some(ref total_path) = self.total_pages_path {
220            if let Some(total_pages) = extract_json_path_u64(response_body, total_path) {
221                if self.current_page > total_pages {
222                    return Ok(None);
223                }
224            }
225        }
226
227        Ok(Some(NextPage::QueryParams(vec![
228            (self.page_param.clone(), self.current_page.to_string()),
229            (self.page_size_param.clone(), self.page_size.to_string()),
230        ])))
231    }
232}
233
234// ── Cursor ──────────────────────────────────────────────────────────────────
235
236struct CursorPaginator {
237    cursor_param: String,
238    cursor_path: String,
239    has_more_path: Option<String>,
240    page_size_param: Option<String>,
241    page_size: Option<u64>,
242}
243
244impl Paginator for CursorPaginator {
245    fn initial_params(&self) -> Vec<(String, String)> {
246        let mut params = Vec::new();
247        if let (Some(ref param), Some(size)) = (&self.page_size_param, self.page_size) {
248            params.push((param.clone(), size.to_string()));
249        }
250        params
251    }
252
253    fn next_page(
254        &mut self,
255        response_body: &JsonValue,
256        _response_headers: &HeaderMap,
257        items_count: usize,
258    ) -> Result<Option<NextPage>> {
259        // If has_more_path is set, check it
260        if let Some(ref has_more_path) = self.has_more_path {
261            if let Some(has_more) = extract_json_path_bool(response_body, has_more_path) {
262                if !has_more {
263                    return Ok(None);
264                }
265            }
266        }
267
268        // If no items were returned, we're done
269        if items_count == 0 {
270            return Ok(None);
271        }
272
273        // Extract cursor value for next request
274        let cursor = extract_json_path_string(response_body, &self.cursor_path);
275        match cursor {
276            Some(cursor_value) if !cursor_value.is_empty() => {
277                let mut params = vec![(self.cursor_param.clone(), cursor_value)];
278                if let (Some(ref param), Some(size)) = (&self.page_size_param, self.page_size) {
279                    params.push((param.clone(), size.to_string()));
280                }
281                Ok(Some(NextPage::QueryParams(params)))
282            }
283            _ => Ok(None),
284        }
285    }
286}
287
288// ── Link Header ─────────────────────────────────────────────────────────────
289
290struct LinkHeaderPaginator {
291    page_size_param: Option<String>,
292    page_size: Option<u64>,
293    origin_host: String,
294}
295
296impl Paginator for LinkHeaderPaginator {
297    fn initial_params(&self) -> Vec<(String, String)> {
298        let mut params = Vec::new();
299        if let (Some(ref param), Some(size)) = (&self.page_size_param, self.page_size) {
300            params.push((param.clone(), size.to_string()));
301        }
302        params
303    }
304
305    fn next_page(
306        &mut self,
307        _response_body: &JsonValue,
308        response_headers: &HeaderMap,
309        items_count: usize,
310    ) -> Result<Option<NextPage>> {
311        if items_count == 0 {
312            return Ok(None);
313        }
314
315        let next_url = parse_link_header_next(response_headers);
316        match next_url {
317            Some(url) => {
318                let validated = validate_pagination_url(&url, &self.origin_host)?;
319                Ok(Some(NextPage::NewUrl(validated)))
320            }
321            None => Ok(None),
322        }
323    }
324}
325
326// ── Next URL ────────────────────────────────────────────────────────────────
327
328struct NextUrlPaginator {
329    next_url_path: String,
330    base_url: Option<String>,
331    origin_host: String,
332}
333
334impl Paginator for NextUrlPaginator {
335    fn initial_params(&self) -> Vec<(String, String)> {
336        Vec::new()
337    }
338
339    fn next_page(
340        &mut self,
341        response_body: &JsonValue,
342        _response_headers: &HeaderMap,
343        _items_count: usize,
344    ) -> Result<Option<NextPage>> {
345        let next_url = extract_json_path_string(response_body, &self.next_url_path);
346        match next_url {
347            Some(url) if !url.is_empty() => {
348                // If it's a relative URL and we have a base_url, combine them
349                let full_url = if url.starts_with("http://") || url.starts_with("https://") {
350                    url
351                } else if let Some(ref base) = self.base_url {
352                    format!("{}{}", base.trim_end_matches('/'), url)
353                } else {
354                    url
355                };
356                let validated = validate_pagination_url(&full_url, &self.origin_host)?;
357                Ok(Some(NextPage::NewUrl(validated)))
358            }
359            _ => Ok(None),
360        }
361    }
362}
363
364// ── Helper functions ────────────────────────────────────────────────────────
365
366/// Extract a string value from a JSON document using a simple path expression.
367/// Supports dot-notation paths like "$.data[-1].id" or "$.nextRecordsUrl".
368pub fn extract_json_path_string(value: &JsonValue, path: &str) -> Option<String> {
369    let result = navigate_path(value, path)?;
370    match result {
371        JsonValue::String(s) => Some(s.clone()),
372        JsonValue::Number(n) => Some(n.to_string()),
373        JsonValue::Bool(b) => Some(b.to_string()),
374        JsonValue::Null => None,
375        _ => Some(result.to_string()),
376    }
377}
378
379/// Extract a u64 value from a JSON document using a path expression.
380pub fn extract_json_path_u64(value: &JsonValue, path: &str) -> Option<u64> {
381    let result = navigate_path(value, path)?;
382    result.as_u64()
383}
384
385/// Extract a boolean value from a JSON document using a path expression.
386pub fn extract_json_path_bool(value: &JsonValue, path: &str) -> Option<bool> {
387    let result = navigate_path(value, path)?;
388    result.as_bool()
389}
390
391/// Navigate a JSON value using a simple JSONPath-like expression.
392/// Supports: $.field, $.field.nested, $.array[0], $.array[-1]
393pub fn navigate_path<'a>(value: &'a JsonValue, path: &str) -> Option<&'a JsonValue> {
394    let path = path
395        .strip_prefix("$.")
396        .unwrap_or(path.strip_prefix("$").unwrap_or(path));
397
398    if path.is_empty() {
399        return Some(value);
400    }
401
402    let mut current = value;
403    for segment in split_path_segments(path) {
404        current = navigate_segment(current, &segment)?;
405    }
406    Some(current)
407}
408
/// Break a path into field and bracket segments.
///
/// Dots separate field names (empty names are dropped). Bracketed parts become
/// their own segment and keep their brackets, e.g. `data[-1].id` yields
/// `data`, `[-1]`, `id`. An unterminated bracket runs to the end of the path
/// and is emitted as if it had been closed.
fn split_path_segments(path: &str) -> Vec<String> {
    let mut pieces: Vec<String> = Vec::new();
    let mut field = String::new();
    let mut remaining = path.chars();

    // `while let` rather than `for`: the bracket branch pulls extra characters
    // from the same iterator.
    while let Some(ch) = remaining.next() {
        match ch {
            '.' | '[' => {
                if !field.is_empty() {
                    pieces.push(std::mem::take(&mut field));
                }
                if ch == '[' {
                    // `take_while` also consumes the closing bracket.
                    let inner: String = remaining.by_ref().take_while(|&c| c != ']').collect();
                    pieces.push(format!("[{inner}]"));
                }
            }
            other => field.push(other),
        }
    }

    if !field.is_empty() {
        pieces.push(field);
    }

    pieces
}
452
453/// Navigate a single path segment.
454fn navigate_segment<'a>(value: &'a JsonValue, segment: &str) -> Option<&'a JsonValue> {
455    if let Some(index_str) = segment.strip_prefix('[').and_then(|s| s.strip_suffix(']')) {
456        // Array index
457        let arr = value.as_array()?;
458        if arr.is_empty() {
459            return None;
460        }
461        let index: i64 = index_str.parse().ok()?;
462        let len = arr.len() as i64;
463        let actual_index = if index < 0 {
464            // Bounds check: ensure -index <= len
465            if -index > len {
466                return None;
467            }
468            (len + index) as usize
469        } else {
470            index as usize
471        };
472        arr.get(actual_index)
473    } else {
474        // Object field
475        value.get(segment)
476    }
477}
478
479/// Parse the Link header to find the URL with rel="next".
480/// Uses bracket-aware splitting to handle commas inside URL angle brackets.
481/// Parses parameters per RFC 5988 (split on `;`, trim, exact-match `rel="next"`).
482fn parse_link_header_next(headers: &HeaderMap) -> Option<String> {
483    let link_header = headers.get("link")?.to_str().ok()?;
484
485    // Split on commas that are outside angle brackets
486    for part in split_link_header(link_header) {
487        let part = part.trim();
488        // Parse parameters per RFC 5988: split on ';' and check each param
489        if has_rel_next(part) {
490            // Extract URL between < and >
491            if let Some(start) = part.find('<') {
492                if let Some(end) = part.find('>') {
493                    return Some(part[start + 1..end].to_string());
494                }
495            }
496        }
497    }
498
499    None
500}
501
/// Check whether a single Link header entry declares the `next` relation.
///
/// Parameters are split on `;`. A `rel` parameter matches when any of its
/// whitespace-separated relation types equals `next` (ASCII case-insensitive).
/// The value may be double-quoted, single-quoted, or an unquoted token, and
/// whitespace around `=` is tolerated, so `rel="next"`, `rel=next` and
/// `rel="prev next"` all match while `rel="nextpage"` does not.
///
/// Only the text after the closing `>` of the link target is inspected, so a
/// target URL that happens to contain `;rel=next` is not mistaken for a
/// parameter. Input without a `>` is inspected in full.
fn has_rel_next(part: &str) -> bool {
    let params = match part.find('>') {
        // '>' is a single byte, so `close + 1` is always a char boundary.
        Some(close) => &part[close + 1..],
        None => part,
    };

    params.split(';').any(|param| {
        let (name, value) = match param.split_once('=') {
            Some(pair) => pair,
            None => return false,
        };
        if !name.trim().eq_ignore_ascii_case("rel") {
            return false;
        }

        let value = value.trim();
        let relation_types = value
            .strip_prefix('"')
            .and_then(|v| v.strip_suffix('"'))
            .or_else(|| value.strip_prefix('\'').and_then(|v| v.strip_suffix('\'')))
            .unwrap_or(value);

        relation_types
            .split_ascii_whitespace()
            .any(|relation| relation.eq_ignore_ascii_case("next"))
    })
}
513
/// Split a Link header value into its comma-separated entries.
///
/// Commas inside `<...>` are part of a URL and do not split. The returned
/// slices borrow from `header`, are not trimmed, and always number at least
/// one (an empty header yields a single empty slice).
fn split_link_header(header: &str) -> Vec<&str> {
    let mut entries = Vec::new();
    let mut open_brackets = 0u32;
    let mut entry_start = 0usize;

    // Byte-wise scan is safe: '<', '>' and ',' are ASCII and never occur
    // inside a multi-byte UTF-8 sequence.
    for (pos, byte) in header.bytes().enumerate() {
        if byte == b'<' {
            open_brackets += 1;
        } else if byte == b'>' {
            // A stray '>' must not push the counter below zero.
            open_brackets = open_brackets.saturating_sub(1);
        } else if byte == b',' && open_brackets == 0 {
            entries.push(&header[entry_start..pos]);
            entry_start = pos + 1;
        }
    }

    entries.push(&header[entry_start..]);
    entries
}
534
// Unit tests for the JSON path helpers, the Link header parser, and the
// paginators built through `create_paginator`.
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    // Nested object fields resolve through dot notation.
    #[test]
    fn test_extract_simple_path() {
        let data = json!({"data": {"total": 100}});
        assert_eq!(extract_json_path_u64(&data, "$.data.total"), Some(100));
    }

    // `[-1]` addresses the last array element.
    #[test]
    fn test_extract_array_last() {
        let data = json!({"data": [{"id": "a"}, {"id": "b"}, {"id": "c"}]});
        assert_eq!(
            extract_json_path_string(&data, "$.data[-1].id"),
            Some("c".to_string())
        );
    }

    // Boolean values are returned as `bool`.
    #[test]
    fn test_extract_bool() {
        let data = json!({"has_more": true});
        assert_eq!(extract_json_path_bool(&data, "$.has_more"), Some(true));
    }

    // A field that does not exist yields `None`.
    #[test]
    fn test_extract_missing_path() {
        let data = json!({"data": {}});
        assert_eq!(extract_json_path_string(&data, "$.nonexistent"), None);
    }

    // With several entries in one header value, the `rel="next"` target is returned.
    #[test]
    fn test_parse_link_header() {
        let mut headers = HeaderMap::new();
        headers.insert(
            "link",
            r#"<https://api.github.com/repos?page=3>; rel="next", <https://api.github.com/repos?page=50>; rel="last""#
                .parse()
                .unwrap(),
        );
        assert_eq!(
            parse_link_header_next(&headers),
            Some("https://api.github.com/repos?page=3".to_string())
        );
    }

    // A header without a `next` relation yields `None`.
    #[test]
    fn test_parse_link_header_no_next() {
        let mut headers = HeaderMap::new();
        headers.insert(
            "link",
            r#"<https://api.github.com/repos?page=1>; rel="first""#
                .parse()
                .unwrap(),
        );
        assert_eq!(parse_link_header_next(&headers), None);
    }

    // Offset/limit: initial params start at offset 0; a full page continues
    // and a partial page stops.
    #[test]
    fn test_offset_limit_paginator() {
        let config = PaginationConfig::OffsetLimit {
            offset_param: "offset".to_string(),
            limit_param: "limit".to_string(),
            page_size: 10,
            total_path: None,
        };

        let mut paginator = create_paginator(&config, "example.com".to_string());
        let initial = paginator.initial_params();
        assert_eq!(
            initial,
            vec![
                ("offset".to_string(), "0".to_string()),
                ("limit".to_string(), "10".to_string())
            ]
        );

        // Full page → should have next
        let headers = HeaderMap::new();
        let body = json!({});
        let next = paginator.next_page(&body, &headers, 10).unwrap();
        assert!(next.is_some());

        // Partial page → should be done
        let next = paginator.next_page(&body, &headers, 5).unwrap();
        assert!(next.is_none());
    }

    // Cursor: continues while `has_more` is true and stops when it is false,
    // even though the second page still contains an item.
    #[test]
    fn test_cursor_paginator_with_has_more() {
        let config = PaginationConfig::Cursor {
            cursor_param: "starting_after".to_string(),
            cursor_path: "$.data[-1].id".to_string(),
            has_more_path: Some("$.has_more".to_string()),
            page_size_param: Some("limit".to_string()),
            page_size: Some(10),
        };

        let mut paginator = create_paginator(&config, "example.com".to_string());

        let headers = HeaderMap::new();
        let body = json!({"data": [{"id": "a"}, {"id": "b"}], "has_more": true});
        let next = paginator.next_page(&body, &headers, 2).unwrap();
        assert!(next.is_some());

        let body = json!({"data": [{"id": "c"}], "has_more": false});
        let next = paginator.next_page(&body, &headers, 1).unwrap();
        assert!(next.is_none());
    }

    // Next-URL: a relative URL is joined with `base_url`; a body without the
    // configured field ends pagination.
    #[test]
    fn test_next_url_paginator() {
        let config = PaginationConfig::NextUrl {
            next_url_path: "$.nextRecordsUrl".to_string(),
            base_url: Some("https://instance.salesforce.com".to_string()),
        };

        let mut paginator = create_paginator(&config, "instance.salesforce.com".to_string());
        let headers = HeaderMap::new();

        let body = json!({"nextRecordsUrl": "/services/data/v56.0/query/abc-123"});
        let next = paginator.next_page(&body, &headers, 10).unwrap();
        match next {
            Some(NextPage::NewUrl(url)) => {
                assert_eq!(
                    url,
                    "https://instance.salesforce.com/services/data/v56.0/query/abc-123"
                );
            }
            _ => panic!("Expected NewUrl"),
        }

        // No next URL → done
        let body = json!({"records": []});
        let next = paginator.next_page(&body, &headers, 0).unwrap();
        assert!(next.is_none());
    }

    // Negative indices beyond the start of the array must not wrap around.
    #[test]
    fn test_negative_index_out_of_bounds() {
        let data = json!({"data": [{"id": "a"}, {"id": "b"}]});
        // -3 on a 2-element array should return None, not wrap
        assert_eq!(extract_json_path_string(&data, "$.data[-3].id"), None);
        // -2 should work (first element)
        assert_eq!(
            extract_json_path_string(&data, "$.data[-2].id"),
            Some("a".to_string())
        );
    }

    // A bare `$` refers to the document root, which may itself be an array.
    #[test]
    fn test_navigate_path_top_level_array() {
        let data = json!([{"id": "1"}, {"id": "2"}]);
        let result = navigate_path(&data, "$");
        assert!(result.is_some());
        assert!(result.unwrap().is_array());
    }

    // A next URL pointing at a different host is rejected with an SSRF error.
    #[test]
    fn test_ssrf_protection_rejects_different_host() {
        let config = PaginationConfig::NextUrl {
            next_url_path: "$.next".to_string(),
            base_url: None,
        };

        let mut paginator = create_paginator(&config, "api.example.com".to_string());
        let headers = HeaderMap::new();

        // Attacker injects an internal URL in the response
        let body = json!({"next": "http://169.254.169.254/latest/meta-data/"}); // DevSkim: ignore DS137138
        let result = paginator.next_page(&body, &headers, 10);
        assert!(result.is_err(), "Should reject URL to different host");
        let err_msg = format!("{}", result.unwrap_err());
        assert!(
            err_msg.contains("SSRF protection"),
            "Error should mention SSRF: {err_msg}"
        );
    }

    // A next URL on the origin host is accepted.
    #[test]
    fn test_ssrf_protection_allows_same_host() {
        let config = PaginationConfig::NextUrl {
            next_url_path: "$.next".to_string(),
            base_url: None,
        };

        let mut paginator = create_paginator(&config, "api.example.com".to_string());
        let headers = HeaderMap::new();

        let body = json!({"next": "https://api.example.com/page/2"});
        let result = paginator.next_page(&body, &headers, 10).unwrap();
        assert!(matches!(result, Some(NextPage::NewUrl(_))));
    }

    // A `file:` URL is rejected; with no `base_url` it is validated as-is.
    #[test]
    fn test_ssrf_protection_rejects_non_http_scheme() {
        let config = PaginationConfig::NextUrl {
            next_url_path: "$.next".to_string(),
            base_url: None,
        };

        let mut paginator = create_paginator(&config, "api.example.com".to_string());
        let headers = HeaderMap::new();

        let body = json!({"next": "file:///etc/passwd"});
        let result = paginator.next_page(&body, &headers, 10);
        assert!(result.is_err(), "Should reject non-HTTP scheme");
    }
}