Skip to main content

web_capture/
stackoverflow.rs

1//! Helpers for capturing Stack Overflow question pages.
2//!
3//! Direct non-browser fetches of question pages can receive an anti-bot
4//! challenge instead of the question content. For HTML-derived outputs we use
5//! the public `StackPrinter` export for the same question, while screenshot
6//! capture continues to use the original page URL.
7
8use crate::{Result, WebCaptureError};
9use std::time::Duration;
10use tokio::time::sleep;
11use url::Url;
12
13const STACKPRINTER_RETRIES: usize = 3;
14const STACKPRINTER_RETRY_BASE_DELAY_MS: u64 = 1_000;
15
16/// Return true when `url` points at a Stack Overflow question page.
17#[must_use]
18pub fn is_stackoverflow_question_url(url: &str) -> bool {
19    stackoverflow_question_id(url).is_some()
20}
21
22/// Build the public `StackPrinter` URL for a Stack Overflow question.
23#[must_use]
24pub fn stackprinter_url(url: &str) -> Option<String> {
25    let question_id = stackoverflow_question_id(url)?;
26    Some(format!(
27        "https://stackprinter.appspot.com/export?question={question_id}&service=stackoverflow&language=en&hideAnswers=false&showAll=true&width=640"
28    ))
29}
30
31/// Fetch a Stack Overflow question through `StackPrinter`.
32///
33/// # Errors
34///
35/// Returns an error if `url` is not a Stack Overflow question URL, or if the
36/// network request fails.
37pub async fn fetch_stackoverflow_html(url: &str) -> Result<String> {
38    let stackprinter =
39        stackprinter_url(url).ok_or_else(|| WebCaptureError::InvalidUrl(url.to_string()))?;
40    let mut last_error = None;
41
42    for attempt in 0..=STACKPRINTER_RETRIES {
43        match fetch_stackprinter_html_once(&stackprinter).await {
44            Ok(html) if !is_stackprinter_transient_error(&html) => return Ok(html),
45            Ok(_) => {
46                last_error = Some("StackPrinter returned a transient error page".to_string());
47            }
48            Err(error) => {
49                last_error = Some(error.to_string());
50            }
51        }
52
53        if attempt < STACKPRINTER_RETRIES {
54            let delay_factor = 2_u64.pow(u32::try_from(attempt).expect("retry attempt fits u32"));
55            sleep(Duration::from_millis(
56                STACKPRINTER_RETRY_BASE_DELAY_MS * delay_factor,
57            ))
58            .await;
59        }
60    }
61
62    Err(WebCaptureError::FetchError(last_error.unwrap_or_else(
63        || "StackPrinter failed without an error message".to_string(),
64    )))
65}
66
67async fn fetch_stackprinter_html_once(stackprinter: &str) -> Result<String> {
68    let response = reqwest::get(stackprinter)
69        .await
70        .and_then(reqwest::Response::error_for_status)
71        .map_err(|error| WebCaptureError::FetchError(error.to_string()))?;
72    response
73        .text()
74        .await
75        .map_err(|error| WebCaptureError::FetchError(error.to_string()))
76}
77
/// Report whether `html` contains one of the marker phrases this module
/// treats as a transient `StackPrinter` error page.
///
/// The match is a case-sensitive substring search.
#[must_use]
fn is_stackprinter_transient_error(html: &str) -> bool {
    ["Ooooops", "Please try again later"]
        .iter()
        .any(|marker| html.contains(*marker))
}
82
83fn stackoverflow_question_id(url: &str) -> Option<String> {
84    let parsed = Url::parse(url).ok()?;
85    let host = parsed.host_str()?.trim_start_matches("www.");
86    if host != "stackoverflow.com" {
87        return None;
88    }
89
90    let mut segments = parsed.path_segments()?;
91    if segments.next()? != "questions" {
92        return None;
93    }
94
95    let question_id = segments.next()?;
96    if question_id
97        .chars()
98        .all(|character| character.is_ascii_digit())
99    {
100        Some(question_id.to_string())
101    } else {
102        None
103    }
104}