bms_table/fetch/
reqwest.rs

1//! Network fetching module based on `reqwest`
2//!
3//! Provides an all-in-one ability to fetch and parse BMS difficulty tables from a web page or a header JSON source:
4//! - Fetch the page and extract the bmstable header URL from HTML (if present);
5//! - Download and parse the header JSON;
6//! - Download and parse chart data according to `data_url` in the header;
7//! - Return a `BmsTable` containing the header and the chart set.
8//!
9//! # Example
10//!
11//! ```rust,no_run
12//! # #[tokio::main]
13//! # async fn main() -> anyhow::Result<()> {
14//! use bms_table::fetch::reqwest::{fetch_table, make_lenient_client};
15//! let client = make_lenient_client()?;
16//! let table = fetch_table(&client, "https://stellabms.xyz/sl/table.html").await?;
17//! assert!(!table.data.charts.is_empty());
18//! # Ok(())
19//! # }
20//! ```
21#![cfg(feature = "reqwest")]
22
23use anyhow::{Context, Result, anyhow};
24use reqwest::header::{HeaderMap, HeaderName, HeaderValue};
25use serde::de::DeserializeOwned;
26use std::time::Duration;
27use url::Url;
28
29use crate::{
30    BmsTable, BmsTableData, BmsTableHeader, BmsTableInfo, BmsTableList, BmsTableRaw,
31    fetch::{HeaderQueryContent, get_web_header_json_value, replace_control_chars},
32};
33
34/// Fetch and parse a complete BMS difficulty table from a web page or a header JSON source.
35///
36/// # Parameters
37///
38/// - `web_url`: page URL or an URL pointing directly to the header JSON.
39///
40/// # Returns
41///
42/// Parsed [`crate::BmsTable`], containing header and chart data.
43///
44/// # Errors
45///
46/// - Network request failures (connection failure, timeout, etc.)
47/// - Response content cannot be parsed as HTML/JSON or structure is unexpected
48/// - Header JSON does not contain `data_url` or has the wrong type
49pub async fn fetch_table_full(
50    client: &reqwest::Client,
51    web_url: &str,
52) -> Result<(BmsTable, BmsTableRaw)> {
53    let web_url = Url::parse(web_url).context("When parsing web url")?;
54    let web_response = client
55        .get(web_url.clone())
56        .send()
57        .await
58        .context("When fetching web")?
59        .text()
60        .await
61        .context("When parsing web response")?;
62    let (hq, web_used_raw) = header_query_with_fallback::<BmsTableHeader>(&web_response)
63        .context("When parsing header query")?;
64    let (header_url, header, header_raw) = match hq {
65        HeaderQueryContent::Url(header_url_string) => {
66            let header_url = web_url
67                .join(&header_url_string)
68                .context("When joining header url")?;
69            let header_response = client
70                .get(header_url.clone())
71                .send()
72                .await
73                .context("When fetching header")?;
74            let header_response_string = header_response
75                .text()
76                .await
77                .context("When parsing header response")?;
78            let (hq2, raw2) = header_query_with_fallback::<BmsTableHeader>(&header_response_string)
79                .context("When parsing header query")?;
80            let HeaderQueryContent::Value(v) = hq2 else {
81                return Err(anyhow!(
82                    "Cycled header found. web_url: {web_url}, header_url: {header_url_string}"
83                ));
84            };
85            (header_url, v, raw2)
86        }
87        HeaderQueryContent::Value(value) => (web_url, value, web_used_raw),
88    };
89    let data_url = header_url
90        .join(&header.data_url)
91        .context("When joining data url")?;
92    let data_response = client
93        .get(data_url.clone())
94        .send()
95        .await
96        .context("When fetching web")?
97        .text()
98        .await
99        .context("When parsing web response")?;
100    let (data, data_raw_str) = parse_json_str_with_fallback::<BmsTableData>(&data_response)
101        .context("When parsing data json")?;
102    Ok((
103        BmsTable { header, data },
104        BmsTableRaw {
105            header_json_url: header_url,
106            header_raw,
107            data_json_url: data_url,
108            data_raw: data_raw_str,
109        },
110    ))
111}
112
113/// Fetch and parse a complete BMS difficulty table.
114///
115/// See [`fetch_table_full`].
116pub async fn fetch_table(client: &reqwest::Client, web_url: &str) -> Result<BmsTable> {
117    let (table, _raw) = fetch_table_full(client, web_url)
118        .await
119        .context("When fetching full table")?;
120    Ok(table)
121}
122
123/// Fetch a list of BMS difficulty tables.
124///
125/// Downloads a JSON array from the provided `web_url` and parses it into a list of [`crate::BmsTableInfo`].
126/// Each item only requires `name`, `symbol`, and `url` (string); all other fields are collected into `extra`.
127pub async fn fetch_table_list(
128    client: &reqwest::Client,
129    web_url: &str,
130) -> Result<Vec<BmsTableInfo>> {
131    let (out, _raw) = fetch_table_list_full(client, web_url)
132        .await
133        .context("When fetching table list full")?;
134    Ok(out)
135}
136
137/// Fetch a list of BMS difficulty tables along with the raw JSON string.
138///
139/// Returns the parsed array of list entries and the raw JSON response text for recording or debugging.
140pub async fn fetch_table_list_full(
141    client: &reqwest::Client,
142    web_url: &str,
143) -> Result<(Vec<BmsTableInfo>, String)> {
144    let web_url = Url::parse(web_url).context("When parsing table list url")?;
145    let response_text = client
146        .get(web_url)
147        .send()
148        .await
149        .context("When fetching table list")?
150        .text()
151        .await
152        .context("When parsing table list response")?;
153    let (list, raw_used) = parse_json_str_with_fallback::<BmsTableList>(&response_text)
154        .context("When parsing table list json")?;
155    let out: Vec<BmsTableInfo> = list.listes;
156    Ok((out, raw_used))
157}
158
159/// Create a more lenient and compatible HTTP client.
160///
161/// - Set a browser-like UA;
162/// - Configure timeouts and redirects;
163/// - Accept invalid certificates (for a few non-compliant sites);
164/// - Accept invalid hostnames (for a few non-compliant sites);
165///
166/// Note: use `danger_accept_invalid_certs` with caution in production.
167pub fn make_lenient_client() -> Result<reqwest::Client> {
168    // Default headers emulate real browser behavior more closely
169    let mut headers = HeaderMap::new();
170    headers.insert(
171        HeaderName::from_static("accept"),
172        HeaderValue::from_static(
173            "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
174        ),
175    );
176    headers.insert(
177        HeaderName::from_static("accept-language"),
178        HeaderValue::from_static("zh-CN,zh;q=0.9,en;q=0.8"),
179    );
180    headers.insert(
181        HeaderName::from_static("upgrade-insecure-requests"),
182        HeaderValue::from_static("1"),
183    );
184    headers.insert(
185        HeaderName::from_static("connection"),
186        HeaderValue::from_static("keep-alive"),
187    );
188
189    let client = reqwest::Client::builder()
190        .default_headers(headers)
191        .user_agent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119 Safari/537.36 bms-table-rs")
192        .timeout(Duration::from_secs(60))
193        .redirect(reqwest::redirect::Policy::limited(100))
194        // Automatically include Referer on redirects, closer to browser behavior
195        .referer(true)
196        // Enable cookie store, closer to real user sessions
197        .cookie_store(true)
198        // Keep lenient TLS settings for compatibility with some non-compliant sites
199        .danger_accept_invalid_certs(true)
200        .danger_accept_invalid_hostnames(true)
201        .build()
202        .context("When building client")?;
203    Ok(client)
204}
205
206/// Parse JSON from a raw string with a fallback.
207///
208/// Tries to deserialize from the original `raw` first; if it fails,
209/// removes illegal control characters and retries. Returns the parsed
210/// value and the raw string actually used for the successful parse.
211fn parse_json_str_with_fallback<T: DeserializeOwned>(raw: &str) -> Result<(T, String)> {
212    match serde_json::from_str::<T>(raw) {
213        Ok(v) => Ok((v, raw.to_string())),
214        Err(_) => {
215            let cleaned = replace_control_chars(raw);
216            let v = serde_json::from_str::<T>(&cleaned).context("When parsing cleaned json")?;
217            Ok((v, cleaned))
218        }
219    }
220}
221
222/// Extract header query content from a page string with a fallback.
223///
224/// Attempts `get_web_header_json_value(raw)` first; on failure, retries
225/// with a control-character-cleaned string. Returns the content and the
226/// raw string actually used for the successful extraction.
227fn header_query_with_fallback<T: DeserializeOwned>(
228    raw: &str,
229) -> Result<(HeaderQueryContent<T>, String)> {
230    match get_web_header_json_value::<T>(raw) {
231        Ok(v) => Ok((v, raw.to_string())),
232        Err(_) => {
233            let cleaned = replace_control_chars(raw);
234            let v = get_web_header_json_value::<T>(&cleaned)
235                .context("When extracting header from cleaned text")?;
236            Ok((v, cleaned))
237        }
238    }
239}