bms_table/fetch.rs
1//! Data fetching and HTML parsing helpers
2//!
3//! Provides HTML parsing when the `scraper` feature is enabled, used to extract the header JSON URL from
4//! `<meta name="bmstable" content="...">` in a page.
5//! Also provides a unified entry to parse a response string into the header JSON or its URL.
6//!
7//! # Examples
8//!
9//! ```rust
10//! # use bms_table::fetch::{get_web_header_json_value, HeaderQueryContent};
11//! let html = r#"
12//! <!DOCTYPE html>
13//! <html>
14//! <head>
15//! <meta name="bmstable" content="header.json">
16//! </head>
17//! <body></body>
18//! </html>
19//! "#;
20//! match get_web_header_json_value::<serde_json::Value>(html).unwrap() {
21//! HeaderQueryContent::Url(u) => assert_eq!(u, "header.json"),
22//! _ => unreachable!(),
23//! }
24//! ```
25#![cfg(feature = "scraper")]
26
27pub mod reqwest;
28
29use std::future::Future;
30
31use anyhow::{Context, Result, anyhow};
32use scraper::{ElementRef, Html, Selector};
33use serde::de::DeserializeOwned;
34
35use crate::{BmsTable, BmsTableInfo, BmsTableRaw};
36
/// Result of fetching a table with its raw JSON strings.
///
/// Bundles the parsed [`BmsTable`] with the [`BmsTableRaw`] data it was
/// obtained from, so callers can keep the original text alongside the
/// parsed representation.
pub struct FetchedTable {
    /// Parsed table.
    pub table: BmsTable,
    /// Raw JSON strings and resolved URLs.
    pub raw: BmsTableRaw,
}
44
/// Result of fetching a table list with its raw JSON string.
///
/// Pairs the parsed list entries with the JSON text they were parsed from.
pub struct FetchedTableList {
    /// Parsed list entries.
    pub tables: Vec<BmsTableInfo>,
    /// Raw JSON string actually used for parsing.
    pub raw_json: String,
}
52
/// Unified interface for fetching BMS tables.
///
/// Both methods take `&self` and return a `Send` future whose lifetime is tied
/// to that borrow (`+ '_`), so the fetcher must outlive the returned future.
pub trait TableFetcher {
    /// Fetch and parse a complete BMS difficulty table, including raw JSON strings.
    ///
    /// `web_url` is the address the fetch starts from.
    ///
    /// # Errors
    ///
    /// Returns an error if fetching or parsing the table fails.
    fn fetch_table(
        &self,
        web_url: url::Url,
    ) -> impl Future<Output = Result<FetchedTable>> + Send + '_;

    /// Fetch a list of BMS difficulty tables, including the raw JSON string.
    ///
    /// `web_url` is the address the list is fetched from.
    ///
    /// # Errors
    ///
    /// Returns an error if fetching or parsing the list fails.
    fn fetch_table_list(
        &self,
        web_url: url::Url,
    ) -> impl Future<Output = Result<FetchedTableList>> + Send + '_;
}
75
/// Return type of [`get_web_header_json_value`].
///
/// - If the input is HTML, returns the URL extracted from `<meta name="bmstable">`;
/// - If the input is JSON, returns the parsed value of type `T`.
///
/// The common traits are derived so the value can be logged, cloned and
/// compared in tests; each derive only applies when `T` implements the
/// corresponding trait.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum HeaderQueryContent<T> {
    /// Extracted header JSON URL.
    ///
    /// May be relative or absolute; prefer using `url::Url::join` to resolve.
    Url(String),
    /// Parsed header JSON content.
    Value(T),
}
88
/// Strip every Unicode control character from `s`.
///
/// Rationale: some sites return JSON with illegal control characters in or
/// around it, and dropping them before parsing improves compatibility. The
/// input is left untouched; a new `String` is returned.
///
/// Note that the filter is [`char::is_control`], so line feeds, carriage
/// returns and tabs are removed as well, not only "invisible" bytes.
#[must_use]
pub fn replace_control_chars(s: &str) -> String {
    let mut kept = String::new();
    for ch in s.chars() {
        if ch.is_control() {
            continue;
        }
        kept.push(ch);
    }
    kept
}
97
98/// Parse JSON from a raw string with a cleaning fallback.
99///
100/// Tries to deserialize from the original `raw` first. If it fails, removes illegal
101/// control characters using [`replace_control_chars`] and retries. Returns the parsed
102/// value and the raw string that was successfully used.
103///
104/// # Errors
105///
106/// Returns an error when both the original and cleaned strings fail to deserialize.
107pub fn parse_json_str_with_fallback<T: DeserializeOwned>(raw: &str) -> Result<(T, String)> {
108 match serde_json::from_str::<T>(raw) {
109 Ok(v) => Ok((v, raw.to_string())),
110 Err(_) => {
111 let cleaned = replace_control_chars(raw);
112 let v = serde_json::from_str::<T>(&cleaned)?;
113 Ok((v, cleaned))
114 }
115 }
116}
117
118/// Parse a response string into the header JSON or its URL.
119///
120/// Strategy: first attempt to parse as JSON; if it fails, parse as HTML and extract the bmstable URL.
121///
122/// # Returns
123///
124/// - `HeaderQueryContent::Value`: input is JSON;
125/// - `HeaderQueryContent::Url`: input is HTML.
126///
127/// # Errors
128///
129/// Returns an error when the input is HTML but the bmstable field cannot be found.
130pub fn get_web_header_json_value<T: DeserializeOwned>(
131 response_str: &str,
132) -> Result<HeaderQueryContent<T>> {
133 // First try parsing as JSON (remove illegal control characters before parsing); if it fails, treat as HTML and extract the bmstable URL
134 let cleaned = replace_control_chars(response_str);
135 match serde_json::from_str::<T>(&cleaned) {
136 Ok(header_json) => Ok(HeaderQueryContent::Value(header_json)),
137 Err(_) => {
138 let bmstable_url = try_extract_bmstable_from_html(response_str)
139 .context("When extracting bmstable url")?;
140 Ok(HeaderQueryContent::Url(bmstable_url))
141 }
142 }
143}
144
145/// Extract the header query content from a response string with a fallback cleaning step.
146///
147/// Attempts [`get_web_header_json_value`] on `raw` first; on failure, retries with
148/// a control-character-cleaned string via [`replace_control_chars`]. Returns the content
149/// and the raw string actually used for the successful extraction.
150///
151/// # Errors
152///
153/// Returns an error when both attempts fail to extract a header URL or parse JSON.
154pub fn header_query_with_fallback<T: DeserializeOwned>(
155 raw: &str,
156) -> Result<(HeaderQueryContent<T>, String)> {
157 match get_web_header_json_value::<T>(raw) {
158 Ok(v) => Ok((v, raw.to_string())),
159 Err(_) => {
160 let cleaned = replace_control_chars(raw);
161 let v = get_web_header_json_value::<T>(&cleaned)?;
162 Ok((v, cleaned))
163 }
164 }
165}
166
167/// Extract the JSON file URL pointed to by the bmstable field from HTML page content.
168///
169/// Scans `<meta>` tags looking for elements with `name="bmstable"` and reads their `content` attribute.
170///
171/// # Errors
172///
173/// Returns an error when the target tag is not found or `content` is empty.
174pub fn try_extract_bmstable_from_html(html_content: &str) -> Result<String> {
175 let document = Html::parse_document(html_content);
176 let meta_selector = Selector::parse("meta").map_err(|_| anyhow!("meta tag not found"))?;
177 let link_selector = Selector::parse("link").ok();
178 let a_selector = Selector::parse("a").ok();
179 let script_selector = Selector::parse("script").ok();
180
181 let find_attr = |selector: &Selector,
182 attr: &str,
183 keep: &mut dyn FnMut(&ElementRef<'_>, &str) -> bool|
184 -> Option<String> {
185 for element in document.select(selector) {
186 if let Some(value) = element.value().attr(attr)
187 && keep(&element, value)
188 {
189 return Some(value.to_string());
190 }
191 }
192 None
193 };
194
195 let candidate = meta_bmstable(&document, &meta_selector)
196 .or_else(|| {
197 let mut keep = |element: &ElementRef<'_>, href: &str| {
198 element
199 .value()
200 .attr("rel")
201 .is_some_and(|v| v.eq_ignore_ascii_case("bmstable"))
202 && !href.is_empty()
203 };
204 link_selector
205 .as_ref()
206 .and_then(|sel| find_attr(sel, "href", &mut keep))
207 })
208 .or_else(|| {
209 let mut keep = |_: &ElementRef<'_>, href: &str| contains_header_json(href);
210 a_selector
211 .as_ref()
212 .and_then(|sel| find_attr(sel, "href", &mut keep))
213 })
214 .or_else(|| {
215 let mut keep = |_: &ElementRef<'_>, href: &str| contains_header_json(href);
216 link_selector
217 .as_ref()
218 .and_then(|sel| find_attr(sel, "href", &mut keep))
219 })
220 .or_else(|| {
221 let mut keep = |_: &ElementRef<'_>, src: &str| contains_header_json(src);
222 script_selector
223 .as_ref()
224 .and_then(|sel| find_attr(sel, "src", &mut keep))
225 })
226 .or_else(|| {
227 let mut keep = |_: &ElementRef<'_>, content: &str| contains_header_json(content);
228 find_attr(&meta_selector, "content", &mut keep)
229 })
230 .or_else(|| {
231 find_header_json_in_text(html_content)
232 .map(|(start, end)| html_content[start..end].to_string())
233 });
234
235 candidate.map_or_else(
236 || Err(anyhow!("bmstable field or header JSON hint not found")),
237 Ok,
238 )
239}
240
/// Find the start and end byte indices of a substring like "*header*.json" in raw text.
///
/// Matching is ASCII case-insensitive. The span starts right after the nearest
/// quote (`"` or `'`) or whitespace character preceding the first "header"
/// (or at "header" itself when there is none) and ends right after the first
/// ".json" that follows it. Both indices are valid `char` boundaries of `s`,
/// so `&s[start..end]` is always safe.
///
/// Returns `None` when `s` has no "header" or no ".json" after it.
///
/// NOTE(review): the ".json" search is not limited to the same token, so a
/// distant ".json" can produce a long span; treat the result as a hint.
fn find_header_json_in_text(s: &str) -> Option<(usize, usize)> {
    // ASCII lowercasing never changes byte lengths, so offsets found in
    // `lower` are valid offsets into `s` as well.
    let lower = s.to_ascii_lowercase();

    // Only the first "header" matters: if no ".json" follows it, none can
    // follow a later occurrence either.
    let header_idx = lower.find("header")?;
    let json_rel = lower[header_idx..].find(".json")?;
    let end = header_idx + json_rel + ".json".len();

    // Step over the delimiter by its full UTF-8 width; delimiters such as
    // U+3000 (ideographic space) are multi-byte, and `+ 1` would land inside
    // the character and make slicing panic.
    let start = lower[..header_idx]
        .char_indices()
        .rev()
        .find(|&(_, c)| c == '"' || c == '\'' || c.is_whitespace())
        .map_or(header_idx, |(i, c)| i + c.len_utf8());

    Some((start, end))
}
263
/// Heuristic for "this looks like a header JSON path".
///
/// True when, ignoring ASCII case, `s` ends with ".json" and has "header"
/// somewhere in it. A trailing query string or fragment after ".json"
/// therefore makes the check fail.
fn contains_header_json(s: &str) -> bool {
    let lowered = s.to_ascii_lowercase();
    if !lowered.ends_with(".json") {
        return false;
    }
    lowered.contains("header")
}
269
270/// Extract bmstable content from `<meta>` tags.
271fn meta_bmstable(document: &Html, meta_selector: &Selector) -> Option<String> {
272 for element in document.select(meta_selector) {
273 let is_bmstable = element
274 .value()
275 .attr("name")
276 .is_some_and(|v| v.eq_ignore_ascii_case("bmstable"))
277 || element
278 .value()
279 .attr("property")
280 .is_some_and(|v| v.eq_ignore_ascii_case("bmstable"));
281 if is_bmstable
282 && let Some(content_attr) = element.value().attr("content")
283 && !content_attr.is_empty()
284 {
285 return Some(content_attr.to_string());
286 }
287 }
288 None
289}