use std::fmt::Write as _;
use tracing::{debug, warn};
use crate::{ContentType, Error, Fetcher, Result, SourceVariant};
/// Outcome of a successful [`resolve_best_url`] call.
#[derive(Debug, Clone)]
pub struct ResolvedUrl {
/// URL of the candidate whose content was fetched successfully.
pub final_url: String,
/// Which llms.txt flavour `final_url` was identified as.
pub variant: SourceVariant,
/// Classification derived from the line count of the fetched content.
pub content_type: ContentType,
/// Number of lines in the fetched content (as counted by `str::lines`).
pub line_count: usize,
/// Warning flag produced together with `content_type`; `classify_content`
/// sets it only for content shorter than 100 lines.
pub should_warn: bool,
}
pub async fn resolve_best_url(fetcher: &Fetcher, base_url: &str) -> Result<ResolvedUrl> {
let variants = [
try_full_variant(base_url),
Some(base_url.to_string()),
try_base_variant(base_url),
];
let mut blocked_statuses: Vec<u16> = Vec::new();
let mut rejected_statuses: Vec<u16> = Vec::new();
for (idx, maybe_url) in variants.iter().enumerate() {
if let Some(url) = maybe_url {
let should_fetch = match fetcher.head_metadata(url).await {
Ok(head_info) => {
let status = head_info.status;
if (200..=399).contains(&status) {
true
} else if status == 405 || status == 501 {
warn!(
%status,
%url,
"HEAD not supported; falling back to GET for candidate URL"
);
true
} else {
if matches!(status, 401 | 403 | 429) {
blocked_statuses.push(status);
} else {
rejected_statuses.push(status);
}
debug!(
%status,
%url,
"HEAD preflight rejected candidate URL"
);
false
}
},
Err(err) => {
debug!(error = %err, %url, "HEAD preflight failed for candidate URL");
false
},
};
if !should_fetch {
continue;
}
let (content, _sha256) = match fetcher.fetch(url).await {
Ok(result) => result,
Err(err) => {
debug!(error = %err, %url, "GET fallback failed for candidate URL");
continue;
},
};
let line_count = content.lines().count();
let (content_type, should_warn) = classify_content(line_count);
let variant = match idx {
0 => SourceVariant::LlmsFull,
1 => {
if base_url.ends_with("llms-full.txt") {
SourceVariant::LlmsFull
} else if base_url.ends_with("llms.txt") {
SourceVariant::Llms
} else {
SourceVariant::Custom
}
},
2 => SourceVariant::Llms,
_ => SourceVariant::Custom,
};
return Ok(ResolvedUrl {
final_url: url.clone(),
variant,
content_type,
line_count,
should_warn,
});
}
}
Err(Error::NotFound(build_resolution_error_message(
base_url,
&blocked_statuses,
&rejected_statuses,
)))
}
fn build_resolution_error_message(
base_url: &str,
blocked_statuses: &[u16],
rejected_statuses: &[u16],
) -> String {
let mut message = String::new();
let _ = write!(
message,
"Failed to resolve any llms.txt variant for '{base_url}'. \
Tried: llms-full.txt, exact URL, and llms.txt fallback."
);
if !blocked_statuses.is_empty() {
message.push_str(" HEAD returned ");
write_status_list(&mut message, blocked_statuses);
message.push_str(
"; the host may be blocking automated requests. \
Try a direct llms.txt/llms-full.txt URL or add a local file via a manifest.",
);
} else if !rejected_statuses.is_empty() {
message.push_str(" HEAD returned non-success status codes (");
write_status_list(&mut message, rejected_statuses);
message.push_str(").");
}
message
}
/// Appends `statuses` to `message` as a comma-and-space separated list.
///
/// Nothing is appended for an empty slice.
fn write_status_list(message: &mut String, statuses: &[u16]) {
    let mut codes = statuses.iter();

    // The first code goes in bare; every later one carries its own separator.
    if let Some(first) = codes.next() {
        // Writing into a `String` cannot fail, so the result is ignored.
        let _ = write!(message, "{first}");
    }
    for code in codes {
        let _ = write!(message, ", {code}");
    }
}
/// Derives the `llms-full.txt` sibling of a URL whose path ends in `llms.txt`.
///
/// Only the trailing `llms.txt` is rewritten; any earlier occurrence of that text
/// (for example in the host name or a parent path segment) is left untouched.
///
/// Returns `None` when `url` does not end with `llms.txt`. That includes URLs that
/// already end with `llms-full.txt`, because such a URL ends in `full.txt`.
fn try_full_variant(url: &str) -> Option<String> {
    // `strip_suffix` only matches at the very end, unlike `str::replace`, which
    // would rewrite every occurrence and could corrupt the host or path.
    url.strip_suffix("llms.txt")
        .map(|prefix| format!("{prefix}llms-full.txt"))
}
/// Derives the `llms.txt` sibling of a URL whose path ends in `llms-full.txt`.
///
/// Only the trailing `llms-full.txt` is rewritten; any earlier occurrence of that
/// text elsewhere in the URL is left untouched.
///
/// Returns `None` when `url` does not end with `llms-full.txt`.
fn try_base_variant(url: &str) -> Option<String> {
    // `strip_suffix` only matches at the very end, unlike `str::replace`, which
    // would rewrite every occurrence and could corrupt the host or path.
    url.strip_suffix("llms-full.txt")
        .map(|prefix| format!("{prefix}llms.txt"))
}
/// Buckets fetched content by its line count.
///
/// Returns the content classification paired with a warning flag:
/// - fewer than 100 lines: `ContentType::Index`, flag set;
/// - 100 to 1000 lines inclusive: `ContentType::Mixed`, flag clear;
/// - more than 1000 lines: `ContentType::Full`, flag clear.
const fn classify_content(line_count: usize) -> (ContentType, bool) {
    match line_count {
        0..=99 => (ContentType::Index, true),
        100..=1000 => (ContentType::Mixed, false),
        _ => (ContentType::Full, false),
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// `llms.txt` URLs gain a `llms-full.txt` sibling; everything else yields `None`.
    #[test]
    fn test_try_full_variant() {
        let cases: [(&str, Option<&str>); 3] = [
            (
                "https://example.com/llms.txt",
                Some("https://example.com/llms-full.txt"),
            ),
            ("https://example.com/llms-full.txt", None),
            ("https://example.com/docs.txt", None),
        ];

        for &(input, expected) in &cases {
            assert_eq!(try_full_variant(input), expected.map(str::to_string));
        }
    }

    /// `llms-full.txt` URLs gain a `llms.txt` sibling; everything else yields `None`.
    #[test]
    fn test_try_base_variant() {
        let cases: [(&str, Option<&str>); 3] = [
            (
                "https://example.com/llms-full.txt",
                Some("https://example.com/llms.txt"),
            ),
            ("https://example.com/llms.txt", None),
            ("https://example.com/docs.txt", None),
        ];

        for &(input, expected) in &cases {
            assert_eq!(try_base_variant(input), expected.map(str::to_string));
        }
    }
}