bms_table/
fetch.rs

1//! Data fetching and HTML parsing helpers
2//!
3//! Provides HTML parsing when the `scraper` feature is enabled, used to extract the header JSON URL from
4//! `<meta name="bmstable" content="...">` in a page.
5//! Also provides a unified entry to parse a response string into the header JSON or its URL.
6//!
7//! # Examples
8//!
9//! ```rust
10//! # use bms_table::fetch::{get_web_header_json_value, HeaderQueryContent};
11//! let html = r#"
12//! <!DOCTYPE html>
13//! <html>
14//!   <head>
15//!     <meta name="bmstable" content="header.json">
16//!   </head>
17//!   <body></body>
18//! </html>
19//! "#;
20//! match get_web_header_json_value::<serde_json::Value>(html).unwrap() {
21//!     HeaderQueryContent::Url(u) => assert_eq!(u, "header.json"),
22//!     _ => unreachable!(),
23//! }
24//! ```
25#![cfg(feature = "scraper")]
26
27pub mod reqwest;
28
29use std::future::Future;
30
31use anyhow::{Context, Result, anyhow};
32use scraper::{ElementRef, Html, Selector};
33use serde::de::DeserializeOwned;
34
35use crate::{BmsTable, BmsTableInfo, BmsTableRaw};
36
/// Result of fetching a table with its raw JSON strings.
///
/// This is the success value produced by [`TableFetcher::fetch_table`]: the parsed
/// table bundled together with the raw material it was built from.
pub struct FetchedTable {
    /// Parsed table.
    pub table: BmsTable,
    /// Raw JSON strings and resolved URLs.
    pub raw: BmsTableRaw,
}
44
/// Result of fetching a table list with its raw JSON string.
///
/// This is the success value produced by [`TableFetcher::fetch_table_list`].
pub struct FetchedTableList {
    /// Parsed list entries.
    pub tables: Vec<BmsTableInfo>,
    /// Raw JSON string actually used for parsing.
    pub raw_json: String,
}
52
/// Unified interface for fetching BMS tables.
///
/// Both methods return a future instead of being declared `async fn`, which lets the
/// trait require that the future is `Send`. The `'_` bound means the future may borrow
/// `self`, so the fetcher has to stay alive until the future completes.
pub trait TableFetcher {
    /// Fetch and parse a complete BMS difficulty table, including raw JSON strings.
    ///
    /// On success the future resolves to a [`FetchedTable`].
    ///
    /// NOTE(review): `web_url` is presumably the URL of the table's web page (or of its
    /// header JSON); how it is resolved is up to the implementation — confirm there.
    ///
    /// # Errors
    ///
    /// Returns an error if fetching or parsing the table fails.
    fn fetch_table(
        &self,
        web_url: url::Url,
    ) -> impl Future<Output = Result<FetchedTable>> + Send + '_;

    /// Fetch a list of BMS difficulty tables, including the raw JSON string.
    ///
    /// On success the future resolves to a [`FetchedTableList`].
    ///
    /// # Errors
    ///
    /// Returns an error if fetching or parsing the list fails.
    fn fetch_table_list(
        &self,
        web_url: url::Url,
    ) -> impl Future<Output = Result<FetchedTableList>> + Send + '_;
}
75
/// Return type of [`get_web_header_json_value`].
///
/// - If the input is HTML, holds the header JSON URL extracted from the page;
/// - If the input is JSON, holds the parsed value of type `T`.
///
/// `Debug`, `Clone`, `PartialEq` and `Eq` are derived; each one is available whenever
/// `T` implements the same trait, so existing uses with any `T` keep compiling.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum HeaderQueryContent<T> {
    /// Extracted header JSON URL.
    ///
    /// May be relative or absolute; prefer using `url::Url::join` to resolve.
    Url(String),
    /// Parsed header JSON content.
    Value(T),
}
88
/// Return a copy of `s` with every control character stripped out.
///
/// Rationale: some sites serve JSON polluted with illegal control characters.
/// Dropping them before parsing improves compatibility; the input itself is left
/// untouched, so callers can still keep the raw text around.
///
/// Every character for which [`char::is_control`] is true is removed, which includes
/// `\n`, `\r` and `\t`. Despite the name, characters are removed, not substituted.
#[must_use]
pub fn replace_control_chars(s: &str) -> String {
    let mut cleaned = String::with_capacity(s.len());
    for ch in s.chars() {
        if !ch.is_control() {
            cleaned.push(ch);
        }
    }
    cleaned
}
97
98/// Parse JSON from a raw string with a cleaning fallback.
99///
100/// Tries to deserialize from the original `raw` first. If it fails, removes illegal
101/// control characters using [`replace_control_chars`] and retries. Returns the parsed
102/// value and the raw string that was successfully used.
103///
104/// # Errors
105///
106/// Returns an error when both the original and cleaned strings fail to deserialize.
107pub fn parse_json_str_with_fallback<T: DeserializeOwned>(raw: &str) -> Result<(T, String)> {
108    match serde_json::from_str::<T>(raw) {
109        Ok(v) => Ok((v, raw.to_string())),
110        Err(_) => {
111            let cleaned = replace_control_chars(raw);
112            let v = serde_json::from_str::<T>(&cleaned)?;
113            Ok((v, cleaned))
114        }
115    }
116}
117
118/// Parse a response string into the header JSON or its URL.
119///
120/// Strategy: first attempt to parse as JSON; if it fails, parse as HTML and extract the bmstable URL.
121///
122/// # Returns
123///
124/// - `HeaderQueryContent::Value`: input is JSON;
125/// - `HeaderQueryContent::Url`: input is HTML.
126///
127/// # Errors
128///
129/// Returns an error when the input is HTML but the bmstable field cannot be found.
130pub fn get_web_header_json_value<T: DeserializeOwned>(
131    response_str: &str,
132) -> Result<HeaderQueryContent<T>> {
133    // First try parsing as JSON (remove illegal control characters before parsing); if it fails, treat as HTML and extract the bmstable URL
134    let cleaned = replace_control_chars(response_str);
135    match serde_json::from_str::<T>(&cleaned) {
136        Ok(header_json) => Ok(HeaderQueryContent::Value(header_json)),
137        Err(_) => {
138            let bmstable_url = try_extract_bmstable_from_html(response_str)
139                .context("When extracting bmstable url")?;
140            Ok(HeaderQueryContent::Url(bmstable_url))
141        }
142    }
143}
144
145/// Extract the header query content from a response string with a fallback cleaning step.
146///
147/// Attempts [`get_web_header_json_value`] on `raw` first; on failure, retries with
148/// a control-character-cleaned string via [`replace_control_chars`]. Returns the content
149/// and the raw string actually used for the successful extraction.
150///
151/// # Errors
152///
153/// Returns an error when both attempts fail to extract a header URL or parse JSON.
154pub fn header_query_with_fallback<T: DeserializeOwned>(
155    raw: &str,
156) -> Result<(HeaderQueryContent<T>, String)> {
157    match get_web_header_json_value::<T>(raw) {
158        Ok(v) => Ok((v, raw.to_string())),
159        Err(_) => {
160            let cleaned = replace_control_chars(raw);
161            let v = get_web_header_json_value::<T>(&cleaned)?;
162            Ok((v, cleaned))
163        }
164    }
165}
166
167/// Extract the JSON file URL pointed to by the bmstable field from HTML page content.
168///
169/// Scans `<meta>` tags looking for elements with `name="bmstable"` and reads their `content` attribute.
170///
171/// # Errors
172///
173/// Returns an error when the target tag is not found or `content` is empty.
174pub fn try_extract_bmstable_from_html(html_content: &str) -> Result<String> {
175    let document = Html::parse_document(html_content);
176    let meta_selector = Selector::parse("meta").map_err(|_| anyhow!("meta tag not found"))?;
177    let link_selector = Selector::parse("link").ok();
178    let a_selector = Selector::parse("a").ok();
179    let script_selector = Selector::parse("script").ok();
180
181    let find_attr = |selector: &Selector,
182                     attr: &str,
183                     keep: &mut dyn FnMut(&ElementRef<'_>, &str) -> bool|
184     -> Option<String> {
185        for element in document.select(selector) {
186            if let Some(value) = element.value().attr(attr)
187                && keep(&element, value)
188            {
189                return Some(value.to_string());
190            }
191        }
192        None
193    };
194
195    let candidate = meta_bmstable(&document, &meta_selector)
196        .or_else(|| {
197            let mut keep = |element: &ElementRef<'_>, href: &str| {
198                element
199                    .value()
200                    .attr("rel")
201                    .is_some_and(|v| v.eq_ignore_ascii_case("bmstable"))
202                    && !href.is_empty()
203            };
204            link_selector
205                .as_ref()
206                .and_then(|sel| find_attr(sel, "href", &mut keep))
207        })
208        .or_else(|| {
209            let mut keep = |_: &ElementRef<'_>, href: &str| contains_header_json(href);
210            a_selector
211                .as_ref()
212                .and_then(|sel| find_attr(sel, "href", &mut keep))
213        })
214        .or_else(|| {
215            let mut keep = |_: &ElementRef<'_>, href: &str| contains_header_json(href);
216            link_selector
217                .as_ref()
218                .and_then(|sel| find_attr(sel, "href", &mut keep))
219        })
220        .or_else(|| {
221            let mut keep = |_: &ElementRef<'_>, src: &str| contains_header_json(src);
222            script_selector
223                .as_ref()
224                .and_then(|sel| find_attr(sel, "src", &mut keep))
225        })
226        .or_else(|| {
227            let mut keep = |_: &ElementRef<'_>, content: &str| contains_header_json(content);
228            find_attr(&meta_selector, "content", &mut keep)
229        })
230        .or_else(|| {
231            find_header_json_in_text(html_content)
232                .map(|(start, end)| html_content[start..end].to_string())
233        });
234
235    candidate.map_or_else(
236        || Err(anyhow!("bmstable field or header JSON hint not found")),
237        Ok,
238    )
239}
240
/// Find the byte range of a token like `*header*.json` in raw text.
///
/// Matching is ASCII case-insensitive. A token is delimited by `"`, `'` or any
/// Unicode whitespace character. For each occurrence of "header", the token is
/// followed forward up to the next delimiter; if ".json" occurs within that stretch,
/// the match ends right after the first ".json". The match starts right after the
/// nearest delimiter preceding "header", or at "header" itself when there is none.
/// Occurrences of "header" whose token contains no ".json" (for example a `<header>`
/// tag) are skipped and the search continues with the next occurrence.
///
/// Returns `Some((start, end))` as byte offsets into `s` (half-open range), both
/// guaranteed to lie on UTF-8 character boundaries, or `None` when nothing matches.
fn find_header_json_in_text(s: &str) -> Option<(usize, usize)> {
    const NEEDLE: &str = "header";
    const SUFFIX: &str = ".json";

    fn is_delimiter(c: char) -> bool {
        c == '"' || c == '\'' || c.is_whitespace()
    }

    // ASCII lowercasing never changes byte lengths, so offsets into `lower` are
    // valid offsets into `s` as well.
    let lower = s.to_ascii_lowercase();
    let mut pos = 0;
    while let Some(rel) = lower[pos..].find(NEEDLE) {
        let header_at = pos + rel;
        // The token cannot extend past the next delimiter (or the end of the text).
        let token_end = lower[header_at..]
            .find(is_delimiter)
            .map_or(lower.len(), |offset| header_at + offset);
        if let Some(json_rel) = lower[header_at..token_end].find(SUFFIX) {
            let end = header_at + json_rel + SUFFIX.len();
            // Step over the delimiter using its real UTF-8 width: delimiters such as
            // U+3000 (ideographic space) are multi-byte, and `index + 1` would land
            // inside the character.
            let start = lower[..header_at]
                .char_indices()
                .rev()
                .find(|&(_, c)| is_delimiter(c))
                .map_or(header_at, |(i, c)| i + c.len_utf8());
            return Some((start, end));
        }
        pos = header_at + NEEDLE.len();
    }
    None
}
263
/// Heuristic for "this looks like a header JSON path".
///
/// Returns `true` when `s`, compared ASCII case-insensitively, ends with ".json" and
/// contains "header" somewhere. A trailing query string or fragment therefore makes
/// the check fail.
fn contains_header_json(s: &str) -> bool {
    let lowered = s.to_ascii_lowercase();
    if !lowered.ends_with(".json") {
        return false;
    }
    lowered.contains("header")
}
269
270/// Extract bmstable content from `<meta>` tags.
271fn meta_bmstable(document: &Html, meta_selector: &Selector) -> Option<String> {
272    for element in document.select(meta_selector) {
273        let is_bmstable = element
274            .value()
275            .attr("name")
276            .is_some_and(|v| v.eq_ignore_ascii_case("bmstable"))
277            || element
278                .value()
279                .attr("property")
280                .is_some_and(|v| v.eq_ignore_ascii_case("bmstable"));
281        if is_bmstable
282            && let Some(content_attr) = element.value().attr("content")
283            && !content_attr.is_empty()
284        {
285            return Some(content_attr.to_string());
286        }
287    }
288    None
289}