use std::path::PathBuf;
use crate::{
FetchResult, Fetcher, HeadingFilterStats, LanguageFilter, MarkdownParser, ParseResult,
PerformanceMetrics, Result, SearchIndex, Source, SourceType, Storage, TocEntry,
};
use crate::json_builder::build_llms_json;
use crate::url_resolver::resolve_best_url;
/// Storage operations required by the refresh and reindex routines in this module.
///
/// The routines are generic over this trait, so they can run against the real
/// [`Storage`] or against a test double.
pub trait RefreshStorage {
/// Loads the persisted [`Source`] metadata for `alias`.
fn load_metadata(&self, alias: &str) -> Result<Source>;
/// Loads the alias list previously recorded for `alias`.
fn load_llms_aliases(&self, alias: &str) -> Result<Vec<String>>;
/// Persists the raw llms.txt `content` for `alias`.
fn save_llms_txt(&self, alias: &str, content: &str) -> Result<()>;
/// Persists the structured JSON document `data` for `alias`.
fn save_llms_json(&self, alias: &str, data: &crate::LlmsJson) -> Result<()>;
/// Persists the source `metadata` for `alias`.
fn save_metadata(&self, alias: &str, metadata: &Source) -> Result<()>;
/// Returns the path handed to the indexer for `alias`.
fn index_path(&self, alias: &str) -> Result<PathBuf>;
/// Loads the previously saved llms.txt content for `alias`.
fn load_llms_txt(&self, alias: &str) -> Result<String>;
}
impl RefreshStorage for Storage {
fn load_metadata(&self, alias: &str) -> Result<Source> {
Self::load_source_metadata(self, alias)?
.ok_or_else(|| crate::Error::NotFound(format!("Missing metadata for {alias}")))
}
fn load_llms_aliases(&self, alias: &str) -> Result<Vec<String>> {
match Self::load_llms_json(self, alias) {
Ok(llms) => Ok(llms.metadata.aliases),
Err(_) => Ok(Vec::new()),
}
}
fn save_llms_txt(&self, alias: &str, content: &str) -> Result<()> {
Self::save_llms_txt(self, alias, content)
}
fn save_llms_json(&self, alias: &str, data: &crate::LlmsJson) -> Result<()> {
Self::save_llms_json(self, alias, data)
}
fn save_metadata(&self, alias: &str, metadata: &Source) -> Result<()> {
Self::save_source_metadata(self, alias, metadata)
}
fn index_path(&self, alias: &str) -> Result<PathBuf> {
Self::index_dir(self, alias)
}
fn load_llms_txt(&self, alias: &str) -> Result<String> {
Self::load_llms_txt(self, alias)
}
}
/// Indexing step used by the refresh and reindex routines in this module.
///
/// Kept behind a trait so the routines can be driven with a substitute indexer.
pub trait RefreshIndexer {
/// Indexes `blocks` for `alias` using the index located at `index_path`.
///
/// `metrics` is passed by value to the implementation.
fn index(
&self,
alias: &str,
index_path: &std::path::Path,
metrics: PerformanceMetrics,
blocks: &[crate::HeadingBlock],
) -> Result<()>;
}
/// [`RefreshIndexer`] backed by [`SearchIndex`].
#[derive(Default)]
pub struct DefaultRefreshIndexer;

impl RefreshIndexer for DefaultRefreshIndexer {
    /// Creates or opens the search index at `index_path`, attaches `metrics`,
    /// and indexes `blocks` under `alias`.
    fn index(
        &self,
        alias: &str,
        index_path: &std::path::Path,
        metrics: PerformanceMetrics,
        blocks: &[crate::HeadingBlock],
    ) -> Result<()> {
        SearchIndex::create_or_open(index_path)?
            .with_metrics(metrics)
            .index_blocks(alias, blocks)
    }
}
/// Result of a refresh attempt for a single source.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum RefreshOutcome {
/// New content was fetched, persisted and indexed.
Refreshed {
/// Alias of the refreshed source.
alias: String,
/// Number of table-of-contents entries, counted recursively through children.
headings: usize,
/// Total line count taken from the generated document's line index.
lines: usize,
},
/// The fetch reported that the content was not modified.
Unchanged {
/// Alias of the source that was checked.
alias: String,
},
}
/// Summary of a reindex run over already-stored content.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ReindexOutcome {
/// Alias of the reindexed source.
pub alias: String,
/// Heading-block count before the language filter ran.
pub headings_before: usize,
/// Heading-block count after the language filter ran.
pub headings_after: usize,
/// Number of heading blocks the filter removed (`headings_before - headings_after`).
pub filtered: usize,
}
/// Freshly fetched content plus the values reported alongside it by the fetcher.
#[derive(Debug, Clone)]
pub struct RefreshPayload {
/// The fetched document text.
pub content: String,
/// SHA-256 value reported by the fetcher for `content`.
pub sha256: String,
/// `ETag` value reported by the fetcher, if any.
pub etag: Option<String>,
/// `Last-Modified` value reported by the fetcher, if any.
pub last_modified: Option<String>,
}
/// URL and variant chosen for a refresh, as produced by [`resolve_refresh_url`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct RefreshUrlResolution {
/// URL that the refresh should fetch.
pub final_url: String,
/// Variant associated with `final_url`.
pub variant: crate::SourceVariant,
/// `true` when the source was switched from the `Llms` variant to `LlmsFull`.
pub upgraded: bool,
}
/// Decides which URL a refresh should fetch for the given source.
///
/// Only sources whose variant is `Llms` are probed for an upgrade: if
/// `resolve_best_url` succeeds and reports the `LlmsFull` variant, that URL is
/// returned with `upgraded: true`. In every other case — a different starting
/// variant, a probe error, or a probe result that is not `LlmsFull` — the
/// stored URL and variant are returned unchanged with `upgraded: false`.
///
/// # Errors
///
/// The current implementation never returns `Err`; probe failures are treated
/// as "no upgrade available".
pub async fn resolve_refresh_url(
    fetcher: &Fetcher,
    metadata: &Source,
) -> Result<RefreshUrlResolution> {
    let upgrade = if metadata.variant == crate::SourceVariant::Llms {
        resolve_best_url(fetcher, &metadata.url)
            .await
            .ok()
            .filter(|candidate| candidate.variant == crate::SourceVariant::LlmsFull)
    } else {
        None
    };

    let resolution = match upgrade {
        Some(full) => RefreshUrlResolution {
            final_url: full.final_url,
            variant: full.variant,
            upgraded: true,
        },
        None => RefreshUrlResolution {
            final_url: metadata.url.clone(),
            variant: metadata.variant.clone(),
            upgraded: false,
        },
    };
    Ok(resolution)
}
/// Refreshes the source registered under `alias`.
///
/// Loads the stored metadata and alias list, resolves which URL to fetch via
/// [`resolve_refresh_url`], and delegates the remaining work to
/// [`refresh_source_with_metadata`].
///
/// # Errors
///
/// Propagates any error from loading the stored state, resolving the URL, or
/// the delegated refresh.
pub async fn refresh_source<S, I>(
    storage: &S,
    fetcher: &Fetcher,
    alias: &str,
    metrics: PerformanceMetrics,
    indexer: &I,
    filter_preference: bool,
) -> Result<RefreshOutcome>
where
    S: RefreshStorage + Sync,
    I: RefreshIndexer + Sync,
{
    let stored_metadata = storage.load_metadata(alias)?;
    let stored_aliases = storage.load_llms_aliases(alias)?;
    let target = resolve_refresh_url(fetcher, &stored_metadata).await?;

    let outcome = refresh_source_with_metadata(
        storage,
        fetcher,
        alias,
        stored_metadata,
        stored_aliases,
        &target,
        metrics,
        indexer,
        filter_preference,
    )
    .await?;
    Ok(outcome)
}
#[allow(clippy::too_many_arguments)]
pub async fn refresh_source_with_metadata<S, I>(
storage: &S,
fetcher: &Fetcher,
alias: &str,
existing_metadata: Source,
existing_aliases: Vec<String>,
resolution: &RefreshUrlResolution,
metrics: PerformanceMetrics,
indexer: &I,
filter_preference: bool,
) -> Result<RefreshOutcome>
where
S: RefreshStorage + Sync,
I: RefreshIndexer + Sync,
{
let fetch_result = fetcher
.fetch_with_cache(
&resolution.final_url,
existing_metadata.etag.as_deref(),
existing_metadata.last_modified.as_deref(),
)
.await?;
match fetch_result {
FetchResult::NotModified { .. } => {
if existing_metadata.filter_non_english.unwrap_or(true) != filter_preference {
let mut updated_metadata = existing_metadata.clone();
updated_metadata.filter_non_english = Some(filter_preference);
storage.save_metadata(alias, &updated_metadata)?;
}
Ok(RefreshOutcome::Unchanged {
alias: alias.to_string(),
})
},
FetchResult::Modified {
content,
sha256,
etag,
last_modified,
} => {
let payload = RefreshPayload {
content,
sha256,
etag,
last_modified,
};
let mut updated_metadata = existing_metadata.clone();
updated_metadata.url.clone_from(&resolution.final_url);
updated_metadata.variant = resolution.variant.clone();
updated_metadata.filter_non_english = Some(filter_preference);
apply_refresh(
storage,
alias,
updated_metadata,
existing_aliases,
payload,
metrics,
indexer,
)
},
}
}
/// Rebuilds the search index for `alias` from the llms.txt content already in
/// storage, without fetching anything.
///
/// The stored text is parsed, the language filter is applied according to
/// `filter_preference`, and the surviving heading blocks are handed to
/// `indexer`. The returned [`ReindexOutcome`] reports the heading-block counts
/// before and after filtering.
///
/// # Errors
///
/// Propagates errors from loading the stored text, constructing or running the
/// parser, resolving the index path, and indexing.
pub fn reindex_source<S, I>(
    storage: &S,
    alias: &str,
    metrics: PerformanceMetrics,
    indexer: &I,
    filter_preference: bool,
) -> Result<ReindexOutcome>
where
    S: RefreshStorage,
    I: RefreshIndexer,
{
    let stored_text = storage.load_llms_txt(alias)?;
    let mut parser = MarkdownParser::new()?;
    let mut parsed = parser.parse(&stored_text)?;

    // The filter already reports total / accepted / rejected counts, which are
    // exactly the before / after / difference values this function returns.
    let stats = apply_language_filter(&mut parsed, filter_preference);

    let index_dir = storage.index_path(alias)?;
    indexer.index(
        alias,
        index_dir.as_path(),
        metrics,
        &parsed.heading_blocks,
    )?;

    Ok(ReindexOutcome {
        alias: alias.to_string(),
        headings_before: stats.headings_total,
        headings_after: stats.headings_accepted,
        filtered: stats.headings_rejected,
    })
}
/// Persists freshly fetched content for `alias` and rebuilds its index.
///
/// Steps, in order:
/// 1. Parse `payload.content` and apply the language filter (enabled unless
///    `existing_metadata.filter_non_english` is `Some(false)`).
/// 2. Save the raw text.
/// 3. Build the JSON document, copy the descriptive fields, merged aliases,
///    filter statistics and origin into it, and save it.
/// 4. Save the updated [`Source`] metadata with a new `fetched_at` timestamp.
/// 5. Index the surviving heading blocks.
///
/// `existing_metadata` is expected to already carry the URL, variant and filter
/// setting that should be stored; this function does not consult storage for
/// them.
///
/// # Errors
///
/// Propagates errors from the parser, from each storage write, from resolving
/// the index path, and from the indexer. Writes are not rolled back when a
/// later step fails.
#[allow(clippy::too_many_arguments)]
#[allow(clippy::too_many_lines)]
pub fn apply_refresh<S, I>(
    storage: &S,
    alias: &str,
    existing_metadata: Source,
    existing_aliases: Vec<String>,
    payload: RefreshPayload,
    metrics: PerformanceMetrics,
    indexer: &I,
) -> Result<RefreshOutcome>
where
    S: RefreshStorage,
    I: RefreshIndexer,
{
    let mut parser = MarkdownParser::new()?;
    let mut parse_result = parser.parse(&payload.content)?;
    let filter_enabled = existing_metadata.filter_non_english.unwrap_or(true);
    let filter_stats = apply_language_filter(&mut parse_result, filter_enabled);

    storage.save_llms_txt(alias, &payload.content)?;

    let mut llms_json = build_llms_json(
        alias,
        &existing_metadata.url,
        "llms.txt",
        payload.sha256.clone(),
        payload.etag.clone(),
        payload.last_modified.clone(),
        &parse_result,
    );

    // Union of the aliases recorded in the JSON document and in the metadata;
    // sort + dedup makes the result order-independent and duplicate-free.
    let mut metadata_aliases = existing_aliases;
    metadata_aliases.extend(existing_metadata.aliases.iter().cloned());
    metadata_aliases.sort();
    metadata_aliases.dedup();
    llms_json.metadata.aliases = metadata_aliases;

    llms_json.metadata.tags.clone_from(&existing_metadata.tags);
    llms_json
        .metadata
        .description
        .clone_from(&existing_metadata.description);
    llms_json
        .metadata
        .category
        .clone_from(&existing_metadata.category);
    llms_json
        .metadata
        .npm_aliases
        .clone_from(&existing_metadata.npm_aliases);
    llms_json
        .metadata
        .github_aliases
        .clone_from(&existing_metadata.github_aliases);
    llms_json.metadata.variant = existing_metadata.variant.clone();
    llms_json.filter_stats = Some(filter_stats);

    // A local-file origin is kept as is; a remote or missing origin is
    // (re)pointed at the URL being stored.
    let mut origin = existing_metadata.origin.clone();
    origin.source_type = Some(match origin.source_type.take() {
        Some(SourceType::LocalFile { path }) => SourceType::LocalFile { path },
        Some(SourceType::Remote { .. }) | None => SourceType::Remote {
            url: existing_metadata.url.clone(),
        },
    });

    // The origin must be set before the document is saved; assigning it after
    // the save would leave it out of the persisted JSON.
    llms_json.metadata.origin = origin.clone();
    storage.save_llms_json(alias, &llms_json)?;

    let metadata = Source {
        url: existing_metadata.url,
        etag: payload.etag,
        last_modified: payload.last_modified,
        fetched_at: chrono::Utc::now(),
        sha256: payload.sha256,
        variant: existing_metadata.variant,
        aliases: existing_metadata.aliases,
        tags: existing_metadata.tags,
        description: existing_metadata.description,
        category: existing_metadata.category,
        npm_aliases: existing_metadata.npm_aliases,
        github_aliases: existing_metadata.github_aliases,
        origin,
        filter_non_english: existing_metadata.filter_non_english,
    };
    storage.save_metadata(alias, &metadata)?;

    let index_path = storage.index_path(alias)?;
    indexer.index(
        alias,
        index_path.as_path(),
        metrics,
        &parse_result.heading_blocks,
    )?;

    Ok(RefreshOutcome::Refreshed {
        alias: alias.to_string(),
        headings: count_headings(&llms_json.toc),
        lines: llms_json.line_index.total_lines,
    })
}
/// Removes non-English heading blocks from `parse_result` when `filter_enabled`
/// is set, and reports how many blocks were kept and dropped.
///
/// A block is kept only if every Markdown link URL found in its content passes
/// `is_english_url` (a block with no links passes trivially) and its heading
/// path passes `is_english_heading_path`. When `filter_enabled` is `false` the
/// blocks are left untouched and the statistics report zero rejections.
fn apply_language_filter(
    parse_result: &mut ParseResult,
    filter_enabled: bool,
) -> HeadingFilterStats {
    let headings_total = parse_result.heading_blocks.len();

    if filter_enabled {
        let mut language_filter = LanguageFilter::new(true);
        parse_result.heading_blocks.retain(|block| {
            // `all` on an empty list is `true`, so link-free blocks pass here.
            let urls_ok = extract_urls_from_content(&block.content)
                .iter()
                .all(|url| language_filter.is_english_url(url));
            // Evaluated unconditionally, regardless of the URL verdict.
            let heading_ok = language_filter.is_english_heading_path(&block.path);
            urls_ok && heading_ok
        });
    }

    let headings_accepted = parse_result.heading_blocks.len();
    let reason = if filter_enabled {
        "non-English content removed"
    } else {
        "filtering disabled"
    };

    HeadingFilterStats {
        enabled: filter_enabled,
        headings_total,
        headings_accepted,
        headings_rejected: headings_total.saturating_sub(headings_accepted),
        reason: reason.to_string(),
    }
}
/// Counts every entry in a table-of-contents tree: the entries in `entries`
/// plus, recursively, all of their descendants.
fn count_headings(entries: &[TocEntry]) -> usize {
    let mut total = entries.len();
    for entry in entries {
        total += count_headings(&entry.children);
    }
    total
}
fn extract_urls_from_content(content: &str) -> Vec<String> {
let mut urls = Vec::new();
let mut search_start = 0;
while let Some(rel) = content[search_start..].find('[') {
let open_idx = search_start + rel;
if let Some(close_rel) = content[open_idx + 1..].find(']') {
let close_idx = open_idx + 1 + close_rel;
let after_bracket = content.get(close_idx + 1..).unwrap_or("");
if let Some(rest) = after_bracket.strip_prefix('(') {
if let Some(paren_rel) = rest.find(')') {
if let Some(cleaned) = clean_url_slice(&rest[..paren_rel]) {
urls.push(cleaned.to_string());
}
}
}
}
search_start = open_idx + 1;
}
urls
}
fn clean_url_slice(s: &str) -> Option<&str> {
let trimmed = s.trim();
if trimmed.is_empty() {
return None;
}
let trimmed = trimmed
.strip_prefix('"')
.or_else(|| trimmed.strip_prefix('\''))
.unwrap_or(trimmed);
let trimmed = trimmed
.strip_suffix('"')
.or_else(|| trimmed.strip_suffix('\''))
.unwrap_or(trimmed);
let mut end = trimmed.len();
for (idx, ch) in trimmed.char_indices().rev() {
if trailing_punctuation(ch) {
end = idx;
} else {
break;
}
}
if end == 0 {
None
} else {
Some(&trimmed[..end])
}
}
/// Returns `true` for the punctuation characters that are stripped from the
/// end of an extracted URL: `!`, `,`, `.`, `:`, `;` and `?`.
const fn trailing_punctuation(c: char) -> bool {
    matches!(c, '!' | ',' | '.' | ':' | ';' | '?')
}
// Unit tests for the refresh pipeline, driven through in-memory mocks of the
// storage and indexer traits.
#[cfg(test)]
mod tests {
use super::*;
use std::cell::RefCell;
use std::collections::HashMap;
use anyhow::Result;
/// In-memory [`RefreshStorage`] double.
///
/// Reads are served from the `HashMap` fields; writes are only recorded
/// (in the `RefCell` vectors) so tests can assert on how often they happened.
#[derive(Default)]
struct MockStorage {
// Metadata returned by `load_metadata`, keyed by alias.
metadata: HashMap<String, Source>,
// Aliases passed to `save_llms_txt`, in call order.
saved_txt: RefCell<Vec<String>>,
// Aliases passed to `save_llms_json`, in call order.
saved_json: RefCell<Vec<String>>,
// Metadata values passed to `save_metadata`, in call order.
saved_metadata: RefCell<Vec<Source>>,
// Paths returned by `index_path`, keyed by alias.
index_paths: HashMap<String, PathBuf>,
// Text returned by `load_llms_txt`, keyed by alias.
cached_txt: HashMap<String, String>,
}
impl RefreshStorage for MockStorage {
fn load_metadata(&self, alias: &str) -> crate::Result<Source> {
self.metadata
.get(alias)
.cloned()
.ok_or_else(|| crate::Error::NotFound("missing metadata".to_string()))
}
// The mock never has stored aliases.
fn load_llms_aliases(&self, _alias: &str) -> crate::Result<Vec<String>> {
Ok(Vec::new())
}
// Records the alias only; the content itself is discarded.
fn save_llms_txt(&self, alias: &str, _content: &str) -> crate::Result<()> {
self.saved_txt.borrow_mut().push(alias.to_string());
Ok(())
}
// Records the alias only; the document itself is discarded.
fn save_llms_json(&self, alias: &str, _data: &crate::LlmsJson) -> crate::Result<()> {
self.saved_json.borrow_mut().push(alias.to_string());
Ok(())
}
// Keeps a copy of every saved metadata value.
fn save_metadata(&self, _alias: &str, metadata: &Source) -> crate::Result<()> {
self.saved_metadata.borrow_mut().push(metadata.clone());
Ok(())
}
fn index_path(&self, alias: &str) -> crate::Result<PathBuf> {
self.index_paths
.get(alias)
.cloned()
.ok_or_else(|| crate::Error::NotFound("missing index path".to_string()))
}
fn load_llms_txt(&self, alias: &str) -> crate::Result<String> {
self.cached_txt
.get(alias)
.cloned()
.ok_or_else(|| crate::Error::NotFound(format!("missing llms.txt for {alias}")))
}
}
/// [`RefreshIndexer`] double that records which aliases were indexed.
#[derive(Default)]
struct MockIndexer {
// Aliases passed to `index`, in call order.
indexed: RefCell<Vec<String>>,
}
impl RefreshIndexer for MockIndexer {
// Records the alias and succeeds without touching any index.
fn index(
&self,
alias: &str,
_index_path: &std::path::Path,
_metrics: PerformanceMetrics,
_blocks: &[crate::HeadingBlock],
) -> crate::Result<()> {
self.indexed.borrow_mut().push(alias.to_string());
Ok(())
}
}
/// Builds a remote `Llms`-variant source with filtering enabled and no
/// cache validators.
fn sample_source() -> Source {
Source {
url: "https://example.com/llms.txt".to_string(),
etag: None,
last_modified: None,
fetched_at: chrono::Utc::now(),
sha256: "abc123".to_string(),
variant: crate::SourceVariant::Llms,
aliases: Vec::new(),
tags: Vec::new(),
description: None,
category: None,
npm_aliases: Vec::new(),
github_aliases: Vec::new(),
origin: crate::SourceOrigin {
manifest: None,
source_type: Some(SourceType::Remote {
url: "https://example.com/llms.txt".to_string(),
}),
},
filter_non_english: Some(true),
}
}
/// Builds a minimal payload: one heading followed by one paragraph.
fn sample_payload() -> RefreshPayload {
RefreshPayload {
content: "# Title\n\nSome content.\n".to_string(),
sha256: "abc123".to_string(),
etag: None,
last_modified: None,
}
}
/// `apply_refresh` must write the text, the JSON and the metadata exactly
/// once each, and invoke the indexer exactly once.
#[test]
fn apply_refresh_persists_changes() -> Result<()> {
let mut storage = MockStorage::default();
storage.metadata.insert("test".to_string(), sample_source());
storage
.index_paths
.insert("test".to_string(), PathBuf::from("index"));
let indexer = MockIndexer::default();
let outcome = apply_refresh(
&storage,
"test",
sample_source(),
Vec::new(),
sample_payload(),
PerformanceMetrics::default(),
&indexer,
)?;
assert!(matches!(outcome, RefreshOutcome::Refreshed { .. }));
assert_eq!(storage.saved_txt.borrow().len(), 1);
assert_eq!(storage.saved_json.borrow().len(), 1);
assert_eq!(storage.saved_metadata.borrow().len(), 1);
assert_eq!(indexer.indexed.borrow().len(), 1);
Ok(())
}
}