use crate::client::FetchOptions;
use crate::error::FetchError;
use crate::fetchers::default::{
apply_bot_auth_if_enabled, read_body_with_timeout, send_request_following_redirects,
BODY_TIMEOUT, DEFAULT_MAX_BODY_SIZE, TRUNCATION_MESSAGE,
};
use crate::fetchers::Fetcher;
use crate::types::{FetchRequest, FetchResponse};
use crate::DEFAULT_USER_AGENT;
use async_trait::async_trait;
use reqwest::header::{HeaderMap, HeaderValue, ACCEPT, USER_AGENT};
use std::time::Duration;
use url::Url;
/// Timeout handed to `send_request_following_redirects` for every request this
/// fetcher issues (llms.txt probes and regular page fetches alike).
const PROBE_TIMEOUT: Duration = Duration::from_secs(10);

/// Largest probed llms.txt body, in bytes (2 MiB), that `try_fetch_llms_txt`
/// will accept; anything bigger is treated as "no usable llms.txt".
const MAX_LLMS_TXT_SIZE: usize = 2 * 1024 * 1024;

/// Host-name patterns that `DocsSiteFetcher::is_docs_site` checks against the
/// end of the request host.
///
/// NOTE(review): the last two entries are general-purpose hosting platforms,
/// presumably listed because projects often publish docs there — confirm the
/// broad match is intended.
const DOCS_HOSTS: &[&str] = &[
    ".readthedocs.io",
    ".readthedocs.org",
    "docs.rs",
    ".gitbook.io",
    ".netlify.app",
    ".vercel.app",
];

/// Leading host labels (including the trailing dot) that mark a host as a
/// documentation site, e.g. `docs.example.com`.
const DOCS_HOST_PREFIXES: &[&str] = &["docs.", "wiki.", "developer.", "devdocs."];
/// Fetcher for documentation sites.
///
/// It handles direct `llms.txt` / `llms-full.txt` URLs as well as hosts that
/// look like documentation sites (see [`DocsSiteFetcher::is_docs_site`]).
/// The type is stateless.
pub struct DocsSiteFetcher;

impl DocsSiteFetcher {
    /// Creates the (stateless) fetcher.
    pub fn new() -> Self {
        Self
    }

    /// Returns `true` when the URL path is exactly `/llms.txt` or
    /// `/llms-full.txt`.
    fn is_llms_txt_url(url: &Url) -> bool {
        matches!(url.path(), "/llms.txt" | "/llms-full.txt")
    }

    /// Returns `true` when the URL's host looks like a documentation site.
    ///
    /// A host qualifies when it matches an entry of `DOCS_HOSTS` or starts
    /// with one of `DOCS_HOST_PREFIXES`. URLs without a host never qualify.
    ///
    /// `DOCS_HOSTS` entries are interpreted as follows:
    /// * entries beginning with `.` are plain suffixes (`.readthedocs.io`
    ///   matches `proj.readthedocs.io`);
    /// * bare entries (`docs.rs`) must match the whole host or a sub-domain
    ///   of it, so look-alike domains such as `mydocs.rs` are rejected.
    fn is_docs_site(url: &Url) -> bool {
        let Some(host) = url.host_str() else {
            return false;
        };
        // Ignore the root label of a fully-qualified name ("docs.rs.") and
        // compare case-insensitively.
        let host = host.strip_suffix('.').unwrap_or(host).to_ascii_lowercase();

        let matches_known_host = DOCS_HOSTS.iter().any(|&pattern| {
            if pattern.starts_with('.') {
                host.ends_with(pattern)
            } else {
                // Require a label boundary in front of the pattern so that
                // "docs.rs" does not match "evildocs.rs".
                host.strip_suffix(pattern)
                    .is_some_and(|rest| rest.is_empty() || rest.ends_with('.'))
            }
        });

        matches_known_host
            || DOCS_HOST_PREFIXES
                .iter()
                .any(|&prefix| host.starts_with(prefix))
    }
}
impl Default for DocsSiteFetcher {
fn default() -> Self {
Self::new()
}
}
#[async_trait]
impl Fetcher for DocsSiteFetcher {
fn name(&self) -> &'static str {
"docs_site"
}
fn matches(&self, url: &Url) -> bool {
Self::is_llms_txt_url(url) || Self::is_docs_site(url)
}
async fn fetch(
&self,
request: &FetchRequest,
options: &FetchOptions,
) -> Result<FetchResponse, FetchError> {
let url = Url::parse(&request.url).map_err(|_| FetchError::InvalidUrlScheme)?;
let user_agent = options.user_agent.as_deref().unwrap_or(DEFAULT_USER_AGENT);
let ua_header = HeaderValue::from_str(user_agent)
.unwrap_or_else(|_| HeaderValue::from_static(DEFAULT_USER_AGENT));
if Self::is_llms_txt_url(&url) {
return fetch_llms_txt_direct(url, ua_header, options).await;
}
let origin = format!(
"{}://{}{}",
url.scheme(),
url.host_str().unwrap_or_default(),
url.port().map(|p| format!(":{}", p)).unwrap_or_default()
);
let probe_urls = [
(format!("{}/llms-full.txt", origin), "llms-full.txt"),
(format!("{}/llms.txt", origin), "llms.txt"),
];
for (probe_url, source) in &probe_urls {
let probe_url = Url::parse(probe_url).map_err(|_| FetchError::InvalidUrlScheme)?;
if let Some(content) = try_fetch_llms_txt(probe_url, ua_header.clone(), options).await {
return Ok(FetchResponse {
url: request.url.clone(),
status_code: 200,
content_type: Some("text/plain".to_string()),
format: Some("documentation".to_string()),
content: Some(format!("<!-- Source: {} -->\n\n{}", source, content)),
..Default::default()
});
}
}
let mut headers = HeaderMap::new();
headers.insert(USER_AGENT, ua_header);
headers.insert(
ACCEPT,
HeaderValue::from_static("text/html, text/plain, text/markdown, */*"),
);
let headers = apply_bot_auth_if_enabled(headers, options, &url);
let (response, redirect_chain) = send_request_following_redirects(
url,
reqwest::Method::GET,
headers,
options,
PROBE_TIMEOUT,
)
.await?;
let status_code = response.status().as_u16();
let final_url = response.url().to_string();
let content_type = response
.headers()
.get("content-type")
.and_then(|v| v.to_str().ok())
.map(|s| s.to_string());
let body = response
.text()
.await
.map_err(|e| FetchError::RequestError(e.to_string()))?;
let (content, format) = if content_type
.as_deref()
.is_some_and(|ct| ct.contains("text/html"))
{
(
crate::convert::html_to_markdown(&body),
"markdown".to_string(),
)
} else {
(body, "documentation".to_string())
};
Ok(FetchResponse {
url: final_url,
status_code,
content_type,
format: Some(format),
content: Some(content),
redirect_chain,
..Default::default()
})
}
}
/// Fetches a URL that already points at an llms.txt file.
///
/// * `url` – the llms.txt URL to request.
/// * `ua_header` – pre-validated `User-Agent` header value.
/// * `options` – fetch options; `max_body_size` caps the body (falling back
///   to `DEFAULT_MAX_BODY_SIZE`).
///
/// A non-success HTTP status is not an `Err`: it yields a response without
/// content whose `error` field is `"HTTP <code>"`. On success the body is
/// decoded as lossy UTF-8, tagged with format `"documentation"`, and suffixed
/// with `TRUNCATION_MESSAGE` if it had to be cut at the size cap.
///
/// # Errors
/// Propagates failures from `send_request_following_redirects` and
/// `read_body_with_timeout`.
async fn fetch_llms_txt_direct(
    url: Url,
    ua_header: HeaderValue,
    options: &FetchOptions,
) -> Result<FetchResponse, FetchError> {
    let mut headers = HeaderMap::new();
    headers.insert(USER_AGENT, ua_header);
    headers.insert(
        ACCEPT,
        HeaderValue::from_static("text/plain, text/markdown, */*"),
    );
    let headers = apply_bot_auth_if_enabled(headers, options, &url);

    let (response, redirect_chain) = send_request_following_redirects(
        url,
        reqwest::Method::GET,
        headers,
        options,
        PROBE_TIMEOUT,
    )
    .await?;

    let status = response.status();
    let status_code = status.as_u16();
    let final_url = response.url().to_string();
    let content_type = response
        .headers()
        .get("content-type")
        .and_then(|v| v.to_str().ok())
        .map(|s| s.to_string());

    if !status.is_success() {
        return Ok(FetchResponse {
            url: final_url,
            status_code,
            redirect_chain,
            error: Some(format!("HTTP {}", status_code)),
            ..Default::default()
        });
    }

    let max_body_size = options.max_body_size.unwrap_or(DEFAULT_MAX_BODY_SIZE);
    let (body, truncated) = read_body_with_timeout(response, BODY_TIMEOUT, max_body_size).await?;
    let size = body.len() as u64;

    let mut content = String::from_utf8_lossy(&body).to_string();
    if truncated {
        content.push_str(TRUNCATION_MESSAGE);
    }

    Ok(FetchResponse {
        url: final_url,
        // Report the status the server actually sent (any 2xx), consistent
        // with the error branch above, rather than a hard-coded 200.
        status_code,
        content_type,
        format: Some("documentation".to_string()),
        content: Some(content),
        size: Some(size),
        truncated: if truncated { Some(true) } else { None },
        redirect_chain,
        ..Default::default()
    })
}
/// Best-effort probe for an llms.txt file at `url`.
///
/// Returns `Some(text)` only when the request succeeds with a 2xx status, the
/// response is not HTML (sites commonly answer unknown paths with an HTML
/// page), the body is at most `MAX_LLMS_TXT_SIZE` bytes, is valid UTF-8 and
/// is not blank. Every failure — network error, bad status, oversized or
/// undecodable body — is reported as `None` so the caller can fall through
/// to its next strategy.
async fn try_fetch_llms_txt(
    url: Url,
    ua_header: HeaderValue,
    options: &FetchOptions,
) -> Option<String> {
    let mut headers = HeaderMap::new();
    headers.insert(USER_AGENT, ua_header);
    headers.insert(
        ACCEPT,
        HeaderValue::from_static("text/plain, text/markdown, */*"),
    );
    let headers = apply_bot_auth_if_enabled(headers, options, &url);

    let (mut response, _) = send_request_following_redirects(
        url,
        reqwest::Method::GET,
        headers,
        options,
        PROBE_TIMEOUT,
    )
    .await
    .ok()?;

    if !response.status().is_success() {
        return None;
    }

    // Media types are case-insensitive, so normalise before matching.
    let is_html = response
        .headers()
        .get("content-type")
        .and_then(|v| v.to_str().ok())
        .is_some_and(|ct| ct.to_ascii_lowercase().contains("text/html"));
    if is_html {
        return None;
    }

    // Cheap early exit when the server announces an oversized body.
    // (usize -> u64 is a lossless widening on supported targets.)
    if response
        .content_length()
        .is_some_and(|len| len > MAX_LLMS_TXT_SIZE as u64)
    {
        return None;
    }

    // Stream the body and give up as soon as the cap is exceeded, instead of
    // buffering an arbitrarily large response before checking its size.
    let mut body: Vec<u8> = Vec::new();
    while let Some(chunk) = response.chunk().await.ok()? {
        if body.len() + chunk.len() > MAX_LLMS_TXT_SIZE {
            return None;
        }
        body.extend_from_slice(&chunk);
    }

    let text = String::from_utf8(body).ok()?;
    if text.trim().is_empty() {
        return None;
    }
    Some(text)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Parses a hard-coded test URL; malformed literals are a test bug.
    fn parse(raw: &str) -> Url {
        Url::parse(raw).unwrap()
    }

    /// Only the two well-known llms.txt paths are recognised.
    #[test]
    fn test_is_llms_txt_url() {
        let recognised = [
            "https://example.com/llms.txt",
            "https://example.com/llms-full.txt",
        ];
        for raw in recognised {
            assert!(DocsSiteFetcher::is_llms_txt_url(&parse(raw)), "{}", raw);
        }

        let other = parse("https://example.com/other.txt");
        assert!(!DocsSiteFetcher::is_llms_txt_url(&other));
    }

    /// Known docs hosts and docs-style prefixes match; ordinary sites do not.
    #[test]
    fn test_is_docs_site() {
        let docs_sites = [
            "https://my-project.readthedocs.io/en/latest/",
            "https://docs.rs/tokio/latest/tokio/",
            "https://docs.python.org/3/library/",
            "https://developer.mozilla.org/en-US/docs/Web",
            "https://my-project.gitbook.io/docs/",
        ];
        for raw in docs_sites {
            assert!(DocsSiteFetcher::is_docs_site(&parse(raw)), "{}", raw);
        }

        let ordinary_sites = [
            "https://github.com/owner/repo",
            "https://example.com/page",
        ];
        for raw in ordinary_sites {
            assert!(!DocsSiteFetcher::is_docs_site(&parse(raw)), "{}", raw);
        }
    }

    /// `matches` combines the llms.txt path check with the docs-host check.
    #[test]
    fn test_fetcher_matches() {
        let fetcher = DocsSiteFetcher::new();

        let accepted = [
            "https://example.com/llms.txt",
            "https://docs.rs/tokio/latest/tokio/",
        ];
        for raw in accepted {
            assert!(fetcher.matches(&parse(raw)), "{}", raw);
        }

        assert!(!fetcher.matches(&parse("https://github.com/owner/repo")));
    }
}