use anyhow::{Context, Result};
use log::{debug, error, info};
use reqwest::Url;
use serde::Deserialize;
use crate::api::client;
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
pub enum SearchSource {
Remote,
Local,
}
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct SearchItem {
pub source: SearchSource,
pub package: String,
pub item_type: String,
pub title: String,
pub parent_title: String,
pub doc_text: String,
pub ref_url: String,
}
#[derive(Deserialize)]
struct RawSearchItem {
#[serde(rename = "type")]
item_type: String,
title: String,
#[serde(rename = "parentTitle", default)]
parent_title: String,
#[serde(default)]
doc: String,
#[serde(rename = "ref")]
ref_url: String,
}
#[derive(Deserialize)]
struct SearchData {
items: Vec<RawSearchItem>,
}
pub(crate) fn strip_html(s: &str) -> String {
let tag_stripped = strip_html_tags(s);
decode_html_entities(&tag_stripped)
}
fn strip_html_tags(s: &str) -> String {
let mut out = String::with_capacity(s.len());
let mut in_tag = false;
for c in s.chars() {
match c {
'<' => in_tag = true,
'>' => in_tag = false,
_ if !in_tag => out.push(c),
_ => {}
}
}
out
}
fn decode_html_entities(s: &str) -> String {
let mut out = String::with_capacity(s.len());
let mut rest = s;
while let Some(amp) = rest.find('&') {
out.push_str(&rest[..amp]);
rest = &rest[amp..];
if let Some(semi) = rest.find(';') {
let entity = &rest[..=semi]; let decoded = decode_entity(entity);
out.push_str(decoded.as_deref().unwrap_or(entity));
rest = &rest[semi + 1..];
} else {
out.push('&');
rest = &rest[1..];
}
}
out.push_str(rest);
out
}
fn decode_entity(entity: &str) -> Option<String> {
match entity {
"&" => Some("&".into()),
"<" => Some("<".into()),
">" => Some(">".into()),
""" => Some("\"".into()),
"'" | "'" => Some("'".into()),
" " => Some(" ".into()),
"—" | "—" | "—" => Some("—".into()),
"–" | "–" | "–" => Some("–".into()),
"…" | "…" | "…" => Some("…".into()),
_ if entity.starts_with("&#x") || entity.starts_with("&#X") => {
let hex = entity
.trim_start_matches("&#x")
.trim_start_matches("&#X")
.trim_end_matches(';');
u32::from_str_radix(hex, 16)
.ok()
.and_then(char::from_u32)
.map(|c| c.to_string())
}
_ if entity.starts_with("&#") => {
let dec = entity.trim_start_matches("&#").trim_end_matches(';');
dec.parse::<u32>()
.ok()
.and_then(char::from_u32)
.map(|c| c.to_string())
}
_ => None,
}
}
pub async fn fetch_docs_search_data(package: &str) -> Result<Vec<SearchItem>> {
let c = client()?;
let base = format!("https://hexdocs.pm/{package}/");
info!("[docs] fetch_docs_search_data package={package}");
const DIRECT: &[&str] = &[
"search_data.json",
"search-data.json",
"search_data.js",
"search-data.js",
"dist/search_data.js",
"dist/search-data.js",
];
for candidate in DIRECT {
let url = format!("{base}{candidate}");
let resp = match c.get(&url).send().await {
Ok(r) => r,
Err(e) => {
debug!("[docs] {url} request error: {e}");
continue;
}
};
if !resp.status().is_success() {
debug!("[docs] {url} → {}", resp.status());
continue;
}
debug!("[docs] {url} → 200, parsing");
let body = resp.text().await.unwrap_or_default();
match parse_search_data(&body) {
Ok(data) if !data.items.is_empty() => {
info!(
"[docs] resolved via direct candidate {url} items={}",
data.items.len()
);
return Ok(map_search_items(data, package));
}
Ok(_) => debug!("[docs] {url} parsed but empty, continuing"),
Err(e) => debug!("[docs] {url} parse failed ({e}), continuing"),
}
}
let search_html_url = format!("{base}search.html");
info!("[docs] direct candidates exhausted, trying HTML discovery at {search_html_url}");
let page_resp = c.get(&search_html_url).send().await;
let page_body = match page_resp {
Ok(r) if r.status().is_success() => r.text().await.unwrap_or_default(),
Ok(r) => {
debug!("[docs] search.html → {}", r.status());
String::new()
}
Err(e) => {
debug!("[docs] search.html request error: {e}");
String::new()
}
};
if !page_body.is_empty() {
if let Some(asset_url) = find_search_index_url(&search_html_url, &page_body) {
info!("[docs] HTML discovery found asset {asset_url}");
match c.get(&asset_url).send().await {
Ok(r) if r.status().is_success() => {
let body = r.text().await.unwrap_or_default();
match parse_search_data(&body) {
Ok(data) if !data.items.is_empty() => {
info!(
"[docs] resolved via HTML asset {asset_url} items={}",
data.items.len()
);
return Ok(map_search_items(data, package));
}
Ok(_) => debug!("[docs] HTML asset empty"),
Err(e) => debug!("[docs] HTML asset parse failed: {e}"),
}
}
Ok(r) => debug!("[docs] HTML asset → {}", r.status()),
Err(e) => debug!("[docs] HTML asset request error: {e}"),
}
}
if page_body.contains("searchData") {
match parse_search_data(&page_body) {
Ok(data) if !data.items.is_empty() => {
info!(
"[docs] resolved via inline searchData items={}",
data.items.len()
);
return Ok(map_search_items(data, package));
}
Ok(_) => debug!("[docs] inline searchData empty"),
Err(e) => debug!("[docs] inline searchData parse failed: {e}"),
}
}
}
error!("[docs] all strategies failed for package={package}");
Err(anyhow::anyhow!(
"docs search index not found for '{package}' — tried direct candidates and HTML discovery"
))
}
fn map_search_items(data: SearchData, package: &str) -> Vec<SearchItem> {
data.items
.into_iter()
.map(|r| SearchItem {
source: SearchSource::Remote,
package: package.to_string(),
item_type: r.item_type,
title: r.title,
parent_title: r.parent_title,
doc_text: strip_html(&r.doc),
ref_url: r.ref_url,
})
.collect()
}
fn parse_search_data(body: &str) -> Result<SearchData> {
let body = body.trim();
if body.starts_with('{') {
return serde_json::from_str(body).context("parsing docs search data as JSON");
}
if let Some(kw_pos) = body.find("searchData") {
let after_kw = body[kw_pos + "searchData".len()..].trim_start();
if let Some(eq_pos) = after_kw.find('=') {
let json_part = after_kw[eq_pos + 1..].trim_start();
if json_part.starts_with('{') {
let json_text = extract_json_object(json_part);
return serde_json::from_str(json_text)
.context("parsing docs search data from JS assignment");
}
}
}
Err(anyhow::anyhow!(
"no parseable search data found in response"
))
}
fn extract_json_object(s: &str) -> &str {
let mut depth = 0;
let mut in_string = false;
let mut escape = false;
for (idx, c) in s.char_indices() {
if in_string {
if escape {
escape = false;
} else if c == '\\' {
escape = true;
} else if c == '"' {
in_string = false;
}
} else {
match c {
'{' => depth += 1,
'}' => {
depth -= 1;
if depth == 0 {
return &s[..=idx];
}
}
'"' => in_string = true,
_ => {}
}
}
}
s.trim_end_matches(';').trim()
}
fn find_search_index_url(base_url: &str, html: &str) -> Option<String> {
let base = Url::parse(base_url).ok()?;
let candidates = [
"dist/search_data-",
"dist/search-data-",
"search_data.json",
"search-data.json",
"search_data.js",
"search-data.js",
"dist/search_data.js",
"dist/search-data.js",
"search.json",
"search-index.json",
];
let mut start = 0;
while let Some(src_pos) = html[start..].find("src=") {
let src_pos = start + src_pos + "src=".len();
let trimmed = html[src_pos..].trim_start();
let quote = match trimmed.chars().next() {
Some(q) if q == '"' || q == '\'' => q,
_ => {
start = src_pos;
continue;
}
};
let trimmed = &trimmed[1..];
if let Some(end_quote) = trimmed.find(quote) {
let path = &trimmed[..end_quote];
if candidates.iter().any(|c| path.contains(c)) {
if let Ok(url) = base.join(path) {
return Some(url.into());
}
}
start = src_pos + 1 + end_quote;
} else {
break;
}
}
for candidate in candidates {
let mut start = 0;
while let Some(pos) = html[start..].find(candidate) {
let pos = start + pos;
if let Some(path) = extract_quoted_path(html, pos) {
if let Ok(url) = base.join(&path) {
return Some(url.into());
}
}
start = pos + candidate.len();
}
}
None
}
fn extract_quoted_path(html: &str, pos: usize) -> Option<String> {
let before = &html[..pos];
let quote_pos = before.rfind(['"', '\''])?;
let quote = html.chars().nth(quote_pos)?;
let suffix = &html[pos..];
let end = suffix.find(quote)?;
Some(html[quote_pos + 1..pos + end].to_string())
}
#[cfg(test)]
mod tests {
use super::*;
fn item(type_: &str, title: &str, ref_: &str) -> String {
format!(
r#"{{"type":"{type_}","title":"{title}","parentTitle":"","doc":"","ref":"{ref_}"}}"#
)
}
fn wrap(items: &str) -> String {
format!(r#"{{"items":[{items}]}}"#)
}
#[test]
fn parse_pure_json() {
let json = wrap(&item("function", "add/2", "Math.html#add/2"));
let data = parse_search_data(&json).unwrap();
assert_eq!(data.items.len(), 1);
assert_eq!(data.items[0].title, "add/2");
}
#[test]
fn parse_pure_json_with_equals_in_values() {
let json = wrap(&item("function", "query_param=value", "Mod.html#f/1"));
let data = parse_search_data(&json).unwrap();
assert_eq!(data.items.len(), 1);
assert_eq!(data.items[0].title, "query_param=value");
}
#[test]
fn parse_js_assignment() {
let body = format!(
r#"var searchData={};"#,
wrap(&item("module", "MyMod", "MyMod.html"))
);
let data = parse_search_data(&body).unwrap();
assert_eq!(data.items.len(), 1);
assert_eq!(data.items[0].title, "MyMod");
}
#[test]
fn parse_js_assignment_with_spaces() {
let body = format!(r#"searchData = {};"#, wrap(&item("type", "T", "T.html")));
let data = parse_search_data(&body).unwrap();
assert_eq!(data.items.len(), 1);
}
#[test]
fn find_url_from_script_src_json() {
let html = r#"<script src="search_data.json"></script>"#;
let url = find_search_index_url("https://hexdocs.pm/gleam_stdlib/search.html", html);
assert_eq!(
url.as_deref(),
Some("https://hexdocs.pm/gleam_stdlib/search_data.json")
);
}
#[test]
fn find_url_from_script_src_js() {
let html = r#"<script defer="defer" src="dist/search_data-abc123.js"></script>"#;
let url = find_search_index_url("https://hexdocs.pm/lustre/search.html", html);
assert!(url.is_some());
assert!(url.unwrap().contains("dist/search_data-abc123.js"));
}
#[test]
fn find_url_prefers_src_attribute() {
let html = r#"<link href="search-data.json"><script src="search-data.js"></script>"#;
let url = find_search_index_url("https://hexdocs.pm/pkg/search.html", html);
assert_eq!(
url.as_deref(),
Some("https://hexdocs.pm/pkg/search-data.js")
);
}
#[test]
fn find_url_returns_none_for_unrelated_html() {
let html = r#"<html><body><p>No search here.</p></body></html>"#;
let url = find_search_index_url("https://hexdocs.pm/pkg/search.html", html);
assert!(url.is_none());
}
#[test]
fn map_search_items_sets_source_and_package() {
let json = wrap(&item("function", "add/2", "Math.html#add/2"));
let data = parse_search_data(&json).unwrap();
let items = map_search_items(data, "my_pkg");
assert_eq!(items.len(), 1);
assert_eq!(items[0].source, SearchSource::Remote);
assert_eq!(items[0].package, "my_pkg");
assert_eq!(items[0].title, "add/2");
}
#[test]
fn search_item_serialises_with_source_and_package() {
let si = SearchItem {
source: SearchSource::Remote,
package: "gleam_stdlib".into(),
item_type: "module".into(),
title: "List".into(),
parent_title: "".into(),
doc_text: "List utilities.".into(),
ref_url: "list.html".into(),
};
let json = serde_json::to_string(&si).unwrap();
assert!(json.contains("\"source\":\"Remote\""));
assert!(json.contains("\"package\":\"gleam_stdlib\""));
let roundtrip: SearchItem = serde_json::from_str(&json).unwrap();
assert_eq!(roundtrip.source, SearchSource::Remote);
assert_eq!(roundtrip.package, "gleam_stdlib");
}
#[test]
fn strip_html_removes_tags() {
assert_eq!(strip_html("<p>hello</p>"), "hello");
assert_eq!(strip_html("<em>foo</em> bar"), "foo bar");
assert_eq!(strip_html("no tags here"), "no tags here");
}
#[test]
fn strip_html_decodes_named_entities() {
assert_eq!(strip_html("a & b"), "a & b");
assert_eq!(strip_html("a < b > c"), "a < b > c");
assert_eq!(strip_html("say "hi""), "say \"hi\"");
assert_eq!(strip_html("it's"), "it's");
assert_eq!(strip_html("foo bar"), "foo bar");
}
#[test]
fn strip_html_decodes_dash_entities() {
assert_eq!(strip_html("a—b"), "a—b");
assert_eq!(strip_html("a–b"), "a–b");
assert_eq!(strip_html("wait…"), "wait…");
}
#[test]
fn strip_html_decodes_decimal_numeric_entities() {
assert_eq!(strip_html("A"), "A");
assert_eq!(strip_html("—"), "—");
assert_eq!(strip_html("'"), "'");
}
#[test]
fn strip_html_decodes_hex_numeric_entities() {
assert_eq!(strip_html("A"), "A");
assert_eq!(strip_html("—"), "—");
assert_eq!(strip_html("…"), "…");
}
#[test]
fn strip_html_handles_amp_without_semicolon() {
assert_eq!(strip_html("foo & bar"), "foo & bar");
}
#[test]
fn strip_html_combined_tags_and_entities() {
let input = "<p>foo & <em>bar</em> <baz></p>";
assert_eq!(strip_html(input), "foo & bar <baz>");
}
}