use std::collections::HashMap;
use anyhow::{Context, Result};
use regex::Regex;
use super::config::{ConcurrentFetchConfig, JsonConfig};
use super::json_path;
use crate::http_client::AcceleratedClient;
use crate::site::Engagement;
/// Parse an HTTP response body as JSON, tagging parse failures with the
/// originating API URL so the error points back at the request that produced it.
pub(super) fn parse_response_json(body: &str, api_url: &str) -> Result<serde_json::Value> {
    let parsed: serde_json::Result<serde_json::Value> = serde_json::from_str(body);
    parsed.with_context(|| format!("failed to parse JSON from '{api_url}'"))
}
/// Test-only shim exposing [`json_path::is_non_null`] to unit tests in the
/// parent module without requiring them to import `json_path` directly.
#[cfg(test)]
pub(super) fn json_path_is_non_null(json: &serde_json::Value, path: &str) -> bool {
json_path::is_non_null(json, path)
}
/// Perform a secondary ("additional") GET fetch against `url` and extract
/// named fields from the JSON body according to `json_config`.
///
/// Optional `accept` and `cookies` values are forwarded as the `Accept` and
/// `Cookie` request headers. A path containing `[]` is treated as an array
/// extraction and its matches joined with `", "`; any other path is a scalar
/// extraction. Fields whose path resolves to nothing (or to an empty array)
/// are omitted from the result.
pub(super) async fn fetch_and_extract_json(
    client: &AcceleratedClient,
    url: &str,
    accept: Option<&str>,
    json_config: &JsonConfig,
    cookies: Option<&str>,
) -> Result<HashMap<String, String>> {
    let mut request = client.inner().get(url);
    if let Some(accept_val) = accept {
        request = request.header(reqwest::header::ACCEPT, accept_val);
    }
    if let Some(cookie_val) = cookies {
        request = request.header(reqwest::header::COOKIE, cookie_val);
    }

    // Send, check the HTTP status, then read the body — each step tags its
    // failure with the URL so errors are traceable to the offending fetch.
    let response = request
        .send()
        .await
        .with_context(|| format!("additional fetch request failed for '{url}'"))?;
    let checked = response
        .error_for_status()
        .with_context(|| format!("HTTP error for additional fetch '{url}'"))?;
    let body = checked
        .text()
        .await
        .with_context(|| format!("failed to read additional fetch body from '{url}'"))?;

    let json: serde_json::Value = serde_json::from_str(&body)
        .with_context(|| format!("failed to parse JSON from additional fetch '{url}'"))?;

    let mut fields = HashMap::new();
    for (name, path) in &json_config.0 {
        if path.contains("[]") {
            // Array path: join all matches; skip the field entirely when empty.
            let values = json_path::extract_array(&json, path);
            if !values.is_empty() {
                fields.insert(name.clone(), values.join(", "));
            }
        } else if let Some(value) = json_path::extract(&json, path) {
            fields.insert(name.clone(), value);
        }
    }
    Ok(fields)
}
/// Fetch `url` once and expand a JSON array of items into flattened fields.
///
/// The array is located via `cf.items_path` and at most `cf.item_limit()`
/// items are processed. For each item, every configured JSON path that
/// resolves produces an entry keyed `"{prefix}_{index}_{field}"`.
pub(super) async fn fetch_and_expand_items(
    client: &AcceleratedClient,
    url: &str,
    cf: &ConcurrentFetchConfig,
    cookies: Option<&str>,
) -> Result<HashMap<String, String>> {
    let mut request = client.inner().get(url);
    if let Some(accept_val) = &cf.accept {
        request = request.header(reqwest::header::ACCEPT, accept_val.as_str());
    }
    if let Some(cookie_val) = cookies {
        request = request.header(reqwest::header::COOKIE, cookie_val);
    }

    // Send, check status, read body — each step tags failures with the URL.
    let response = request
        .send()
        .await
        .with_context(|| format!("concurrent fetch failed for '{url}'"))?;
    let checked = response
        .error_for_status()
        .with_context(|| format!("HTTP error for concurrent fetch '{url}'"))?;
    let body = checked
        .text()
        .await
        .with_context(|| format!("failed to read concurrent fetch body from '{url}'"))?;

    let json: serde_json::Value = serde_json::from_str(&body)
        .with_context(|| format!("failed to parse JSON from concurrent fetch '{url}'"))?;

    // Flatten item fields into "{prefix}_{idx}_{field}" keys, silently
    // skipping any configured path that does not resolve on a given item.
    let fields = extract_items_array(&json, &cf.items_path)?
        .iter()
        .take(cf.item_limit())
        .enumerate()
        .flat_map(|(idx, item)| {
            cf.json.0.iter().filter_map(move |(field_name, path)| {
                json_path::extract(item, path)
                    .map(|value| (format!("{}_{}_{}", cf.prefix, idx, field_name), value))
            })
        })
        .collect();
    Ok(fields)
}
/// Walk `items_path` (a dot-separated key path, optionally starting with a
/// leading `.`) into `json` and return the array found at that location.
///
/// An empty path (or a bare `"."`) targets the document root itself.
///
/// # Errors
/// Fails when any path segment is missing, or when the final value is not a
/// JSON array.
pub(super) fn extract_items_array<'a>(
    json: &'a serde_json::Value,
    items_path: &str,
) -> Result<&'a Vec<serde_json::Value>> {
    let trimmed = items_path.trim_start_matches('.');
    let mut node = json;
    if !trimmed.is_empty() {
        for segment in trimmed.split('.') {
            node = node.get(segment).with_context(|| {
                format!("items_path segment '{segment}' not found in JSON response")
            })?;
        }
    }
    node.as_array()
        .with_context(|| format!("items_path '{items_path}' did not resolve to a JSON array"))
}
/// Rewrite `url` in one of two modes: when `to` contains the literal `{url}`
/// placeholder, substitute the percent-encoded original URL into the template;
/// otherwise apply `re` as a regex replacement with `to` as the replacement
/// pattern.
pub(super) fn rewrite_url_with(re: &Regex, to: &str, url: &str) -> String {
    if !to.contains("{url}") {
        return re.replace(url, to).into_owned();
    }
    to.replace("{url}", &urlencoding::encode(url))
}
/// Extract values from `html` using the CSS selectors in `css_map`
/// (field name -> selector, optionally suffixed with `::attr(name)`).
///
/// Only the first element matching each selector is used. Empty values are
/// dropped; unparsable selectors are skipped with a warning.
pub(super) fn extract_css_fields(
    html: &str,
    css_map: &HashMap<String, String>,
) -> HashMap<String, String> {
    use scraper::{Html, Selector};
    let document = Html::parse_document(html);
    let mut fields = HashMap::new();
    for (field_name, raw_selector) in css_map {
        // Split off a trailing `::attr(...)` suffix, if present.
        let (selector_str, attr_name) = parse_css_attr_suffix(raw_selector);
        let selector = match Selector::parse(selector_str) {
            Ok(parsed) => parsed,
            Err(_) => {
                tracing::warn!("Invalid CSS selector for field '{field_name}': '{selector_str}'");
                continue;
            }
        };
        let Some(element) = document.select(&selector).next() else {
            continue;
        };
        // Attribute selectors read the named attribute; plain selectors
        // concatenate the element's text nodes.
        let value = match attr_name {
            Some(attr) => element.value().attr(attr).unwrap_or("").to_string(),
            None => element.text().collect::<String>(),
        };
        if !value.is_empty() {
            fields.insert(field_name.clone(), value);
        }
    }
    fields
}
/// Split a selector of the form `"<css>::attr(<name>)"` into the CSS part and
/// the attribute name. Selectors without a well-formed `::attr(...)` suffix
/// are returned unchanged with `None`.
pub(super) fn parse_css_attr_suffix(selector: &str) -> (&str, Option<&str>) {
    match selector.rfind("::attr(") {
        None => (selector, None),
        Some(attr_start) => {
            let inner = &selector[attr_start + "::attr(".len()..];
            match inner.rfind(')') {
                // Opening marker without a closing paren: treat as plain CSS.
                None => (selector, None),
                Some(close) => (&selector[..attr_start], Some(&inner[..close])),
            }
        }
    }
}
/// Intern a string, returning a `&'static str` that is pointer-identical for
/// equal inputs across calls.
///
/// Backed by a process-wide, mutex-guarded set: each distinct input is leaked
/// exactly once via `Box::leak` and lives for the rest of the program.
pub(super) fn intern_name(s: &str) -> &'static str {
    use std::collections::HashSet;
    use std::sync::{LazyLock, Mutex};
    static INTERNED: LazyLock<Mutex<HashSet<&'static str>>> =
        LazyLock::new(|| Mutex::new(HashSet::new()));
    let mut interned = INTERNED.lock().expect("intern_name lock");
    match interned.get(s) {
        Some(&existing) => existing,
        None => {
            // First sighting: leak one copy and remember it.
            let leaked: &'static str = Box::leak(s.to_owned().into_boxed_str());
            interned.insert(leaked);
            leaked
        }
    }
}
/// Assemble an [`Engagement`] from extracted `fields` using the metric ->
/// field-name mapping in `eng`. Returns `None` when no configured metric
/// resolves to a parseable value.
pub(super) fn build_engagement(
    eng: &super::config::EngagementConfig,
    fields: &HashMap<String, String>,
) -> Option<Engagement> {
    // Resolve one metric: the config must name a field, the field must be
    // present, and its value must parse as a count.
    let resolve = |field: Option<&str>| -> Option<u64> { parse_u64(fields.get(field?)?) };
    let likes = resolve(eng.likes.as_deref());
    let reposts = resolve(eng.reposts.as_deref());
    let replies = resolve(eng.replies.as_deref());
    let views = resolve(eng.views.as_deref());
    // An engagement record is only meaningful if at least one metric parsed.
    let has_any = [likes, reposts, replies, views].iter().any(Option::is_some);
    has_any.then(|| Engagement {
        likes,
        reposts,
        replies,
        views,
    })
}
/// Parse a count from `s`. Plain integers parse directly; anything else that
/// parses as a float is truncated toward zero (the `as` cast saturates, so
/// negative values and NaN become 0). Returns `None` for non-numeric input.
pub(super) fn parse_u64(s: &str) -> Option<u64> {
    match s.parse::<u64>() {
        Ok(whole) => Some(whole),
        Err(_) => {
            let float: f64 = s.parse().ok()?;
            #[allow(clippy::cast_possible_truncation, clippy::cast_sign_loss)]
            let truncated = float as u64;
            Some(truncated)
        }
    }
}