bms_table/
fetch.rs

1//! Data fetching and HTML parsing helpers
2//!
3//! Provides HTML parsing when the `scraper` feature is enabled, used to extract the header JSON URL from
4//! `<meta name="bmstable" content="...">` in a page.
5//! Also provides a unified entry to parse a response string into the header JSON or its URL.
6//!
7//! # Examples
8//!
9//! ```rust
10//! # use bms_table::fetch::{get_web_header_json_value, HeaderQueryContent};
11//! let html = r#"
12//! <!DOCTYPE html>
13//! <html>
14//!   <head>
15//!     <meta name="bmstable" content="header.json">
16//!   </head>
17//!   <body></body>
18//! </html>
19//! "#;
20//! match get_web_header_json_value::<serde_json::Value>(html).unwrap() {
21//!     HeaderQueryContent::Url(u) => assert_eq!(u, "header.json"),
22//!     _ => unreachable!(),
23//! }
24//! ```
25#![cfg(feature = "scraper")]
26
27pub mod reqwest;
28
29use anyhow::{Context, Result, anyhow};
30use scraper::{Html, Selector};
31use serde::de::DeserializeOwned;
32
/// Return type of [`get_web_header_json_value`].
///
/// - If the input is HTML, holds the header JSON URL extracted from the page;
/// - If the input is JSON, holds the parsed value of type `T`.
///
/// `Debug`, `Clone`, `PartialEq` and `Eq` are derived, so each of them is available whenever
/// `T` implements the same trait.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum HeaderQueryContent<T> {
    /// Extracted header JSON URL.
    ///
    /// May be relative or absolute; prefer using `url::Url::join` to resolve.
    Url(String),
    /// Parsed header JSON content.
    Value(T),
}
45
/// Return a copy of `s` with every control character removed.
///
/// A character is dropped when [`char::is_control`] is true for it. That covers the whole
/// Unicode `Cc` category, so `\n`, `\r` and `\t` are removed as well as NUL, DEL and the C1
/// controls.
///
/// Rationale: some sites return JSON with illegal control characters in or around it.
/// The input itself is left untouched; only the returned copy is cleaned.
pub(crate) fn replace_control_chars(s: &str) -> String {
    // The result can never be longer than the input, so one allocation is enough.
    let mut cleaned = String::with_capacity(s.len());
    for ch in s.chars() {
        if !ch.is_control() {
            cleaned.push(ch);
        }
    }
    cleaned
}
53
54/// Parse a response string into the header JSON or its URL.
55///
56/// Strategy: first attempt to parse as JSON; if it fails, parse as HTML and extract the bmstable URL.
57///
58/// # Returns
59///
60/// - `HeaderQueryContent::Json`: input is JSON;
61/// - `HeaderQueryContent::Url`: input is HTML.
62///
63/// # Errors
64///
65/// Returns an error when the input is HTML but the bmstable field cannot be found.
66pub fn get_web_header_json_value<T: DeserializeOwned>(
67    response_str: &str,
68) -> anyhow::Result<HeaderQueryContent<T>> {
69    // First try parsing as JSON (remove illegal control characters before parsing); if it fails, treat as HTML and extract the bmstable URL
70    let cleaned = replace_control_chars(response_str);
71    match serde_json::from_str::<T>(&cleaned) {
72        Ok(header_json) => Ok(HeaderQueryContent::Value(header_json)),
73        Err(_) => {
74            let bmstable_url =
75                extract_bmstable_url(response_str).context("When extracting bmstable url")?;
76            Ok(HeaderQueryContent::Url(bmstable_url))
77        }
78    }
79}
80
81/// Extract the JSON file URL pointed to by the bmstable field from HTML page content.
82///
83/// Scans `<meta>` tags looking for elements with `name="bmstable"` and reads their `content` attribute.
84///
85/// # Errors
86///
87/// Returns an error when the target tag is not found or `content` is empty.
88pub fn extract_bmstable_url(html_content: &str) -> Result<String> {
89    let document = Html::parse_document(html_content);
90
91    // Find all meta tags
92    let Ok(meta_selector) = Selector::parse("meta") else {
93        return Err(anyhow!("meta tag not found"));
94    };
95
96    // 1) Prefer extracting from <meta name="bmstable" content="..."> or <meta property="bmstable">
97    for element in document.select(&meta_selector) {
98        // Tags whose name or property is bmstable
99        let is_bmstable = element
100            .value()
101            .attr("name")
102            .is_some_and(|v| v.eq_ignore_ascii_case("bmstable"))
103            || element
104                .value()
105                .attr("property")
106                .is_some_and(|v| v.eq_ignore_ascii_case("bmstable"));
107        if is_bmstable
108            && let Some(content_attr) = element.value().attr("content")
109            && !content_attr.is_empty()
110        {
111            return Ok(content_attr.to_string());
112        }
113    }
114
115    // 2) Next, try <link rel="bmstable" href="...json">
116    if let Ok(link_selector) = Selector::parse("link") {
117        for element in document.select(&link_selector) {
118            let rel = element.value().attr("rel");
119            let href = element.value().attr("href");
120            if rel.is_some_and(|v| v.eq_ignore_ascii_case("bmstable"))
121                && let Some(href) = href
122                && !href.is_empty()
123            {
124                return Ok(href.to_string());
125            }
126        }
127    }
128
129    // 3) Then try to find clues for *header*.json in common tag attributes
130    //    - a[href], link[href], script[src], meta[content]
131    let lower_contains_header_json = |s: &str| {
132        let ls = s.to_ascii_lowercase();
133        ls.contains("header") && ls.ends_with(".json")
134    };
135
136    // a[href]
137    if let Ok(a_selector) = Selector::parse("a") {
138        for element in document.select(&a_selector) {
139            if let Some(href) = element.value().attr("href")
140                && lower_contains_header_json(href)
141            {
142                return Ok(href.to_string());
143            }
144        }
145    }
146
147    // link[href]
148    if let Ok(link_selector) = Selector::parse("link") {
149        for element in document.select(&link_selector) {
150            if let Some(href) = element.value().attr("href")
151                && lower_contains_header_json(href)
152            {
153                return Ok(href.to_string());
154            }
155        }
156    }
157
158    // script[src]
159    if let Ok(script_selector) = Selector::parse("script") {
160        for element in document.select(&script_selector) {
161            if let Some(src) = element.value().attr("src")
162                && lower_contains_header_json(src)
163            {
164                return Ok(src.to_string());
165            }
166        }
167    }
168
169    // meta[content]
170    for element in document.select(&meta_selector) {
171        if let Some(content_attr) = element.value().attr("content")
172            && lower_contains_header_json(content_attr)
173        {
174            return Ok(content_attr.to_string());
175        }
176    }
177
178    // 4) Finally, a minimal heuristic search on raw text: match substrings containing "header" and ending with .json
179    if let Some((start, end)) = find_header_json_in_text(html_content) {
180        let candidate = &html_content[start..end];
181        return Ok(candidate.to_string());
182    }
183
184    Err(anyhow!("bmstable field or header JSON hint not found"))
185}
186
/// Find a substring like `*header*.json` in raw text, returning its byte range in `s`.
///
/// Matching is ASCII case-insensitive. A double quote, a single quote or any Unicode
/// whitespace character counts as a delimiter. For each occurrence of `header`, the search
/// for `.json` is confined to the text up to the next delimiter, so a match never spans
/// several tokens; if that token has no `.json`, the next `header` occurrence is tried.
///
/// # Returns
///
/// `Some((start, end))` where `end` is just past the first `.json` following `header`, and
/// `start` is just past the nearest delimiter before `header`. When no delimiter precedes
/// `header`, `start` is the position of `header` itself. Both offsets are char boundaries of
/// `s`, so `&s[start..end]` is always valid. Returns `None` when nothing matches.
fn find_header_json_in_text(s: &str) -> Option<(usize, usize)> {
    const NEEDLE: &str = "header";
    const SUFFIX: &str = ".json";

    fn is_delimiter(c: char) -> bool {
        c == '"' || c == '\'' || c.is_whitespace()
    }

    // ASCII lowercasing never changes byte lengths, so offsets into `lower` are valid for `s`.
    let lower = s.to_ascii_lowercase();
    let mut pos = 0;
    while let Some(rel) = lower[pos..].find(NEEDLE) {
        let header_at = pos + rel;
        // The candidate token ends at the next delimiter (or at the end of the text).
        let token_end = lower[header_at..]
            .find(is_delimiter)
            .map_or(lower.len(), |i| header_at + i);
        if let Some(json_rel) = lower[header_at..token_end].find(SUFFIX) {
            let end = header_at + json_rel + SUFFIX.len();
            // Step over the delimiter by its real UTF-8 width: whitespace such as U+3000 is
            // three bytes long, so a fixed `+ 1` would land inside the character.
            let start = lower[..header_at]
                .char_indices()
                .rev()
                .find(|&(_, c)| is_delimiter(c))
                .map_or(header_at, |(i, c)| i + c.len_utf8());
            return Some((start, end));
        }
        pos = header_at + NEEDLE.len(); // skip this "header" and try the next one
    }
    None
}