Skip to main content

blz_core/
refresh.rs

1//! Refresh helpers shared by CLI and MCP consumers.
2
3use std::path::PathBuf;
4
5use crate::{
6    FetchResult, Fetcher, HeadingFilterStats, LanguageFilter, MarkdownParser, ParseResult,
7    PerformanceMetrics, Result, SearchIndex, Source, SourceType, Storage, TocEntry,
8};
9
10use crate::json_builder::build_llms_json;
11use crate::url_resolver::resolve_best_url;
12
/// Abstraction over storage interactions used by refresh routines.
///
/// Every method is keyed by the source `alias`. The refresh and reindex
/// helpers in this module are generic over this trait, so they can run
/// against the concrete [`Storage`] type or against an in-memory stand-in
/// (as the unit tests in this file do).
pub trait RefreshStorage {
    /// Load stored metadata for a source alias.
    ///
    /// # Errors
    ///
    /// Returns an error if the metadata cannot be loaded.
    fn load_metadata(&self, alias: &str) -> Result<Source>;
    /// Load alias list from the cached llms.json for a source.
    ///
    /// # Errors
    ///
    /// Implementations may return an error; the [`Storage`] implementation
    /// in this file instead falls back to an empty list.
    fn load_llms_aliases(&self, alias: &str) -> Result<Vec<String>>;
    /// Persist the latest llms.txt content.
    ///
    /// # Errors
    ///
    /// Returns an error if the content cannot be persisted.
    fn save_llms_txt(&self, alias: &str, content: &str) -> Result<()>;
    /// Persist the computed llms.json metadata payload.
    ///
    /// # Errors
    ///
    /// Returns an error if the payload cannot be persisted.
    fn save_llms_json(&self, alias: &str, data: &crate::LlmsJson) -> Result<()>;
    /// Persist updated source metadata.
    ///
    /// # Errors
    ///
    /// Returns an error if the metadata cannot be persisted.
    fn save_metadata(&self, alias: &str, metadata: &Source) -> Result<()>;
    /// Resolve the on-disk index path for a source.
    ///
    /// # Errors
    ///
    /// Returns an error if the path cannot be resolved.
    fn index_path(&self, alias: &str) -> Result<PathBuf>;
    /// Load cached llms.txt content for a source.
    ///
    /// # Errors
    ///
    /// Returns an error if no cached content can be loaded.
    fn load_llms_txt(&self, alias: &str) -> Result<String>;
}
30
31impl RefreshStorage for Storage {
32    fn load_metadata(&self, alias: &str) -> Result<Source> {
33        Self::load_source_metadata(self, alias)?
34            .ok_or_else(|| crate::Error::NotFound(format!("Missing metadata for {alias}")))
35    }
36
37    fn load_llms_aliases(&self, alias: &str) -> Result<Vec<String>> {
38        match Self::load_llms_json(self, alias) {
39            Ok(llms) => Ok(llms.metadata.aliases),
40            Err(_) => Ok(Vec::new()),
41        }
42    }
43
44    fn save_llms_txt(&self, alias: &str, content: &str) -> Result<()> {
45        Self::save_llms_txt(self, alias, content)
46    }
47
48    fn save_llms_json(&self, alias: &str, data: &crate::LlmsJson) -> Result<()> {
49        Self::save_llms_json(self, alias, data)
50    }
51
52    fn save_metadata(&self, alias: &str, metadata: &Source) -> Result<()> {
53        Self::save_source_metadata(self, alias, metadata)
54    }
55
56    fn index_path(&self, alias: &str) -> Result<PathBuf> {
57        Self::index_dir(self, alias)
58    }
59
60    fn load_llms_txt(&self, alias: &str) -> Result<String> {
61        Self::load_llms_txt(self, alias)
62    }
63}
64
/// Interface for indexing refreshed content.
///
/// The refresh and reindex helpers in this module are generic over this
/// trait, which lets callers (and the unit tests in this file) substitute
/// their own indexing backend.
pub trait RefreshIndexer {
    /// Index a set of heading blocks for the given alias.
    ///
    /// `index_path` is the location obtained from
    /// [`RefreshStorage::index_path`], and `metrics` is passed through from
    /// the caller of the refresh/reindex routine.
    ///
    /// # Errors
    ///
    /// Returns an error if indexing fails.
    fn index(
        &self,
        alias: &str,
        index_path: &std::path::Path,
        metrics: PerformanceMetrics,
        blocks: &[crate::HeadingBlock],
    ) -> Result<()>;
}
76
77/// Default indexer that writes to the Tantivy search index.
78#[derive(Default)]
79pub struct DefaultRefreshIndexer;
80
81impl RefreshIndexer for DefaultRefreshIndexer {
82    fn index(
83        &self,
84        alias: &str,
85        index_path: &std::path::Path,
86        metrics: PerformanceMetrics,
87        blocks: &[crate::HeadingBlock],
88    ) -> Result<()> {
89        let index = SearchIndex::create_or_open(index_path)?.with_metrics(metrics);
90        index.index_blocks(alias, blocks)
91    }
92}
93
/// Summary of what a refresh operation did.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum RefreshOutcome {
    /// New content was fetched, persisted, and handed to the indexer.
    Refreshed {
        /// Alias of the source that was refreshed.
        alias: String,
        /// Total number of table-of-contents entries, nested ones included.
        headings: usize,
        /// Total number of lines in the refreshed document.
        lines: usize,
    },
    /// The fetch reported "not modified"; nothing was re-indexed.
    Unchanged {
        /// Alias of the source that was left as-is.
        alias: String,
    },
}
112
/// Summary of what a reindex operation did.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct ReindexOutcome {
    /// Alias of the source that was reindexed.
    pub alias: String,
    /// Number of heading blocks produced by the parser, before filtering.
    pub headings_before: usize,
    /// Number of heading blocks that remained after filtering and were indexed.
    pub headings_after: usize,
    /// How many heading blocks the filter removed.
    pub filtered: usize,
}
125
/// Freshly fetched content together with the cache validators that came with it.
#[derive(Clone, Debug)]
pub struct RefreshPayload {
    /// Body of the refreshed llms.txt document.
    pub content: String,
    /// SHA256 hash of `content`.
    pub sha256: String,
    /// `ETag` header value from the response, if one was provided.
    pub etag: Option<String>,
    /// Last-Modified header value from the response, if one was provided.
    pub last_modified: Option<String>,
}
138
/// URL resolution details for refresh operations.
///
/// Produced by [`resolve_refresh_url`] and consumed by
/// [`refresh_source_with_metadata`], which fetches `final_url` and, when the
/// content changed, records `final_url` and `variant` in the source metadata.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct RefreshUrlResolution {
    /// Final URL used for refresh.
    pub final_url: String,
    /// Variant chosen for the URL.
    pub variant: crate::SourceVariant,
    /// Whether the source was upgraded to llms-full.txt.
    pub upgraded: bool,
}
149
150/// Resolve the best refresh URL (llms.txt → llms-full.txt) when available.
151pub async fn resolve_refresh_url(
152    fetcher: &Fetcher,
153    metadata: &Source,
154) -> Result<RefreshUrlResolution> {
155    if metadata.variant != crate::SourceVariant::Llms {
156        return Ok(RefreshUrlResolution {
157            final_url: metadata.url.clone(),
158            variant: metadata.variant.clone(),
159            upgraded: false,
160        });
161    }
162
163    match resolve_best_url(fetcher, &metadata.url).await {
164        Ok(resolved) if resolved.variant == crate::SourceVariant::LlmsFull => {
165            Ok(RefreshUrlResolution {
166                final_url: resolved.final_url,
167                variant: resolved.variant,
168                upgraded: true,
169            })
170        },
171        Ok(_) | Err(_) => Ok(RefreshUrlResolution {
172            final_url: metadata.url.clone(),
173            variant: metadata.variant.clone(),
174            upgraded: false,
175        }),
176    }
177}
178
179/// Refresh a source using its current metadata.
180///
181/// # Errors
182///
183/// Returns an error if storage access, fetching, or indexing fails.
184pub async fn refresh_source<S, I>(
185    storage: &S,
186    fetcher: &Fetcher,
187    alias: &str,
188    metrics: PerformanceMetrics,
189    indexer: &I,
190    filter_preference: bool,
191) -> Result<RefreshOutcome>
192where
193    S: RefreshStorage + Sync,
194    I: RefreshIndexer + Sync,
195{
196    let existing_metadata = storage.load_metadata(alias)?;
197    let existing_aliases = storage.load_llms_aliases(alias)?;
198    let resolution = resolve_refresh_url(fetcher, &existing_metadata).await?;
199
200    refresh_source_with_metadata(
201        storage,
202        fetcher,
203        alias,
204        existing_metadata,
205        existing_aliases,
206        &resolution,
207        metrics,
208        indexer,
209        filter_preference,
210    )
211    .await
212}
213
214/// Refresh a source using preloaded metadata and URL resolution.
215///
216/// # Errors
217///
218/// Returns an error if fetching, parsing, or indexing fails.
219#[allow(clippy::too_many_arguments)]
220pub async fn refresh_source_with_metadata<S, I>(
221    storage: &S,
222    fetcher: &Fetcher,
223    alias: &str,
224    existing_metadata: Source,
225    existing_aliases: Vec<String>,
226    resolution: &RefreshUrlResolution,
227    metrics: PerformanceMetrics,
228    indexer: &I,
229    filter_preference: bool,
230) -> Result<RefreshOutcome>
231where
232    S: RefreshStorage + Sync,
233    I: RefreshIndexer + Sync,
234{
235    let fetch_result = fetcher
236        .fetch_with_cache(
237            &resolution.final_url,
238            existing_metadata.etag.as_deref(),
239            existing_metadata.last_modified.as_deref(),
240        )
241        .await?;
242
243    match fetch_result {
244        FetchResult::NotModified { .. } => {
245            if existing_metadata.filter_non_english.unwrap_or(true) != filter_preference {
246                let mut updated_metadata = existing_metadata.clone();
247                updated_metadata.filter_non_english = Some(filter_preference);
248                storage.save_metadata(alias, &updated_metadata)?;
249            }
250            Ok(RefreshOutcome::Unchanged {
251                alias: alias.to_string(),
252            })
253        },
254        FetchResult::Modified {
255            content,
256            sha256,
257            etag,
258            last_modified,
259        } => {
260            let payload = RefreshPayload {
261                content,
262                sha256,
263                etag,
264                last_modified,
265            };
266
267            let mut updated_metadata = existing_metadata.clone();
268            updated_metadata.url.clone_from(&resolution.final_url);
269            updated_metadata.variant = resolution.variant.clone();
270            updated_metadata.filter_non_english = Some(filter_preference);
271
272            apply_refresh(
273                storage,
274                alias,
275                updated_metadata,
276                existing_aliases,
277                &payload,
278                metrics,
279                indexer,
280            )
281        },
282    }
283}
284
285/// Re-parse and re-index a source using cached content.
286///
287/// # Errors
288///
289/// Returns an error if cached content cannot be parsed or indexed.
290pub fn reindex_source<S, I>(
291    storage: &S,
292    alias: &str,
293    metrics: PerformanceMetrics,
294    indexer: &I,
295    filter_preference: bool,
296) -> Result<ReindexOutcome>
297where
298    S: RefreshStorage,
299    I: RefreshIndexer,
300{
301    let content = storage.load_llms_txt(alias)?;
302    let mut parser = MarkdownParser::new()?;
303    let mut parse_result = parser.parse(&content)?;
304
305    let before_count = parse_result.heading_blocks.len();
306    apply_language_filter(&mut parse_result, filter_preference);
307    let after_count = parse_result.heading_blocks.len();
308
309    let index_path = storage.index_path(alias)?;
310    indexer.index(
311        alias,
312        index_path.as_path(),
313        metrics,
314        &parse_result.heading_blocks,
315    )?;
316
317    Ok(ReindexOutcome {
318        alias: alias.to_string(),
319        headings_before: before_count,
320        headings_after: after_count,
321        filtered: before_count.saturating_sub(after_count),
322    })
323}
324
/// Merge aliases from existing metadata with any already-known aliases.
///
/// Returns the union of `existing_aliases` and `metadata_aliases`, sorted
/// and with duplicates removed.
fn merge_aliases(existing_aliases: Vec<String>, metadata_aliases: &[String]) -> Vec<String> {
    let mut merged = existing_aliases;
    // No membership check is needed while appending: sorting followed by
    // `dedup` removes every duplicate, whichever input it came from.
    merged.extend_from_slice(metadata_aliases);
    merged.sort_unstable();
    merged.dedup();
    merged
}
337
338/// Copy preserved fields from existing metadata into the new `llms_json`.
339fn copy_preserved_metadata_fields(llms_json: &mut crate::LlmsJson, existing: &Source) {
340    llms_json.metadata.tags.clone_from(&existing.tags);
341    llms_json
342        .metadata
343        .description
344        .clone_from(&existing.description);
345    llms_json.metadata.category.clone_from(&existing.category);
346    llms_json
347        .metadata
348        .npm_aliases
349        .clone_from(&existing.npm_aliases);
350    llms_json
351        .metadata
352        .github_aliases
353        .clone_from(&existing.github_aliases);
354    llms_json.metadata.variant = existing.variant.clone();
355}
356
357/// Resolve the source origin based on existing metadata.
358fn resolve_origin(existing: &Source) -> crate::SourceOrigin {
359    let mut origin = existing.origin.clone();
360    origin.source_type = match (&origin.source_type, &existing.origin.source_type) {
361        (Some(SourceType::Remote { .. }), _) | (None, None) => Some(SourceType::Remote {
362            url: existing.url.clone(),
363        }),
364        (Some(SourceType::LocalFile { path }), _) => {
365            Some(SourceType::LocalFile { path: path.clone() })
366        },
367        (None, Some(existing_type)) => Some(existing_type.clone()),
368    };
369    origin
370}
371
372/// Build the updated Source metadata for a refresh.
373fn build_refresh_metadata(
374    existing: Source,
375    payload: &RefreshPayload,
376    origin: crate::SourceOrigin,
377) -> Source {
378    Source {
379        url: existing.url,
380        etag: payload.etag.clone(),
381        last_modified: payload.last_modified.clone(),
382        fetched_at: chrono::Utc::now(),
383        sha256: payload.sha256.clone(),
384        variant: existing.variant,
385        aliases: existing.aliases,
386        tags: existing.tags,
387        description: existing.description,
388        category: existing.category,
389        npm_aliases: existing.npm_aliases,
390        github_aliases: existing.github_aliases,
391        origin,
392        filter_non_english: existing.filter_non_english,
393    }
394}
395
396/// Apply a refresh: persist content and re-index the source.
397///
398/// # Errors
399///
400/// Returns an error if parsing, persistence, or indexing fails.
401#[allow(clippy::too_many_arguments)]
402pub fn apply_refresh<S, I>(
403    storage: &S,
404    alias: &str,
405    existing_metadata: Source,
406    existing_aliases: Vec<String>,
407    payload: &RefreshPayload,
408    metrics: PerformanceMetrics,
409    indexer: &I,
410) -> Result<RefreshOutcome>
411where
412    S: RefreshStorage,
413    I: RefreshIndexer,
414{
415    let mut parser = MarkdownParser::new()?;
416    let mut parse_result = parser.parse(&payload.content)?;
417
418    let filter_enabled = existing_metadata.filter_non_english.unwrap_or(true);
419    let filter_stats = Some(apply_language_filter(&mut parse_result, filter_enabled));
420
421    storage.save_llms_txt(alias, &payload.content)?;
422
423    let mut llms_json = build_llms_json(
424        alias,
425        &existing_metadata.url,
426        "llms.txt",
427        payload.sha256.clone(),
428        payload.etag.clone(),
429        payload.last_modified.clone(),
430        &parse_result,
431    );
432
433    llms_json.metadata.aliases = merge_aliases(existing_aliases, &existing_metadata.aliases);
434    copy_preserved_metadata_fields(&mut llms_json, &existing_metadata);
435    llms_json.filter_stats = filter_stats;
436
437    storage.save_llms_json(alias, &llms_json)?;
438
439    let origin = resolve_origin(&existing_metadata);
440    llms_json.metadata.origin = origin.clone();
441
442    let metadata = build_refresh_metadata(existing_metadata, payload, origin);
443    storage.save_metadata(alias, &metadata)?;
444
445    let index_path = storage.index_path(alias)?;
446    indexer.index(
447        alias,
448        index_path.as_path(),
449        metrics,
450        &parse_result.heading_blocks,
451    )?;
452
453    Ok(RefreshOutcome::Refreshed {
454        alias: alias.to_string(),
455        headings: count_headings(&llms_json.toc),
456        lines: llms_json.line_index.total_lines,
457    })
458}
459
460fn apply_language_filter(
461    parse_result: &mut ParseResult,
462    filter_enabled: bool,
463) -> HeadingFilterStats {
464    let original_count = parse_result.heading_blocks.len();
465    if filter_enabled {
466        let mut language_filter = LanguageFilter::new(true);
467        parse_result.heading_blocks.retain(|block| {
468            let urls_in_content = extract_urls_from_content(&block.content);
469            let url_check = urls_in_content.is_empty()
470                || urls_in_content
471                    .iter()
472                    .all(|url| language_filter.is_english_url(url));
473
474            let heading_check = language_filter.is_english_heading_path(&block.path);
475
476            url_check && heading_check
477        });
478    }
479
480    let accepted = parse_result.heading_blocks.len();
481    let filtered_count = original_count.saturating_sub(accepted);
482    HeadingFilterStats {
483        enabled: filter_enabled,
484        headings_total: original_count,
485        headings_accepted: accepted,
486        headings_rejected: filtered_count,
487        reason: if filter_enabled {
488            "non-English content removed".to_string()
489        } else {
490            "filtering disabled".to_string()
491        },
492    }
493}
494
495fn count_headings(entries: &[TocEntry]) -> usize {
496    entries
497        .iter()
498        .map(|entry| 1 + count_headings(&entry.children))
499        .sum()
500}
501
502fn extract_urls_from_content(content: &str) -> Vec<String> {
503    let mut urls = Vec::new();
504
505    let mut search_start = 0;
506    while let Some(rel) = content[search_start..].find('[') {
507        let open_idx = search_start + rel;
508        if let Some(close_rel) = content[open_idx + 1..].find(']') {
509            let close_idx = open_idx + 1 + close_rel;
510            let after_bracket = content.get(close_idx + 1..).unwrap_or("");
511            if let Some(rest) = after_bracket.strip_prefix('(') {
512                if let Some(paren_rel) = rest.find(')') {
513                    if let Some(cleaned) = clean_url_slice(&rest[..paren_rel]) {
514                        urls.push(cleaned.to_string());
515                    }
516                }
517            }
518        }
519        search_start = open_idx + 1;
520    }
521
522    urls
523}
524
/// Normalize a raw link target taken from markdown.
///
/// Trims surrounding whitespace, removes at most one leading and one trailing
/// quote character (`"` or `'`), and then strips any run of trailing
/// punctuation. Returns `None` when nothing is left.
fn clean_url_slice(s: &str) -> Option<&str> {
    let is_quote = |c: char| c == '"' || c == '\'';

    let candidate = s.trim();
    if candidate.is_empty() {
        return None;
    }

    let candidate = candidate.strip_prefix(is_quote).unwrap_or(candidate);
    let candidate = candidate.strip_suffix(is_quote).unwrap_or(candidate);
    let cleaned = candidate.trim_end_matches(trailing_punctuation);

    if cleaned.is_empty() {
        None
    } else {
        Some(cleaned)
    }
}

/// Whether `c` is one of the punctuation characters stripped from the end of a URL.
const fn trailing_punctuation(c: char) -> bool {
    matches!(c, '!' | ',' | '.' | ':' | ';' | '?')
}
560
// Unit tests for the refresh helpers, using in-memory stand-ins for storage
// and indexing so that no filesystem or search index is touched.
#[cfg(test)]
mod tests {
    use super::*;
    use std::cell::RefCell;
    use std::collections::HashMap;

    use anyhow::Result;

    /// In-memory `RefreshStorage` that serves preloaded data and records
    /// which save operations were invoked.
    #[derive(Default)]
    struct MockStorage {
        // Metadata returned by `load_metadata`, keyed by alias.
        metadata: HashMap<String, Source>,
        // Aliases passed to `save_llms_txt`, in call order.
        saved_txt: RefCell<Vec<String>>,
        // Aliases passed to `save_llms_json`, in call order.
        saved_json: RefCell<Vec<String>>,
        // Metadata records passed to `save_metadata`, in call order.
        saved_metadata: RefCell<Vec<Source>>,
        // Paths returned by `index_path`, keyed by alias.
        index_paths: HashMap<String, PathBuf>,
        // Content returned by `load_llms_txt`, keyed by alias.
        cached_txt: HashMap<String, String>,
    }

    impl RefreshStorage for MockStorage {
        fn load_metadata(&self, alias: &str) -> crate::Result<Source> {
            self.metadata
                .get(alias)
                .cloned()
                .ok_or_else(|| crate::Error::NotFound("missing metadata".to_string()))
        }

        // The mock never has cached aliases.
        fn load_llms_aliases(&self, _alias: &str) -> crate::Result<Vec<String>> {
            Ok(Vec::new())
        }

        // Records the alias only; the content itself is discarded.
        fn save_llms_txt(&self, alias: &str, _content: &str) -> crate::Result<()> {
            self.saved_txt.borrow_mut().push(alias.to_string());
            Ok(())
        }

        // Records the alias only; the payload itself is discarded.
        fn save_llms_json(&self, alias: &str, _data: &crate::LlmsJson) -> crate::Result<()> {
            self.saved_json.borrow_mut().push(alias.to_string());
            Ok(())
        }

        // Keeps a copy of each saved record so tests can inspect it.
        fn save_metadata(&self, _alias: &str, metadata: &Source) -> crate::Result<()> {
            self.saved_metadata.borrow_mut().push(metadata.clone());
            Ok(())
        }

        fn index_path(&self, alias: &str) -> crate::Result<PathBuf> {
            self.index_paths
                .get(alias)
                .cloned()
                .ok_or_else(|| crate::Error::NotFound("missing index path".to_string()))
        }

        fn load_llms_txt(&self, alias: &str) -> crate::Result<String> {
            self.cached_txt
                .get(alias)
                .cloned()
                .ok_or_else(|| crate::Error::NotFound(format!("missing llms.txt for {alias}")))
        }
    }

    /// `RefreshIndexer` that records the aliases it was asked to index and
    /// does no actual indexing.
    #[derive(Default)]
    struct MockIndexer {
        // Aliases passed to `index`, in call order.
        indexed: RefCell<Vec<String>>,
    }

    impl RefreshIndexer for MockIndexer {
        fn index(
            &self,
            alias: &str,
            _index_path: &std::path::Path,
            _metrics: PerformanceMetrics,
            _blocks: &[crate::HeadingBlock],
        ) -> crate::Result<()> {
            self.indexed.borrow_mut().push(alias.to_string());
            Ok(())
        }
    }

    /// Remote llms.txt source with no validators and filtering enabled.
    fn sample_source() -> Source {
        Source {
            url: "https://example.com/llms.txt".to_string(),
            etag: None,
            last_modified: None,
            fetched_at: chrono::Utc::now(),
            sha256: "abc123".to_string(),
            variant: crate::SourceVariant::Llms,
            aliases: Vec::new(),
            tags: Vec::new(),
            description: None,
            category: None,
            npm_aliases: Vec::new(),
            github_aliases: Vec::new(),
            origin: crate::SourceOrigin {
                manifest: None,
                source_type: Some(SourceType::Remote {
                    url: "https://example.com/llms.txt".to_string(),
                }),
            },
            filter_non_english: Some(true),
        }
    }

    /// Minimal payload: a single heading followed by one paragraph.
    fn sample_payload() -> RefreshPayload {
        RefreshPayload {
            content: "# Title\n\nSome content.\n".to_string(),
            sha256: "abc123".to_string(),
            etag: None,
            last_modified: None,
        }
    }

    // `apply_refresh` should save the text, the JSON payload, and the
    // metadata exactly once each, and invoke the indexer exactly once.
    #[test]
    fn apply_refresh_persists_changes() -> Result<()> {
        let mut storage = MockStorage::default();
        storage.metadata.insert("test".to_string(), sample_source());
        storage
            .index_paths
            .insert("test".to_string(), PathBuf::from("index"));

        let indexer = MockIndexer::default();
        let outcome = apply_refresh(
            &storage,
            "test",
            sample_source(),
            Vec::new(),
            &sample_payload(),
            PerformanceMetrics::default(),
            &indexer,
        )?;

        assert!(matches!(outcome, RefreshOutcome::Refreshed { .. }));
        assert_eq!(storage.saved_txt.borrow().len(), 1);
        assert_eq!(storage.saved_json.borrow().len(), 1);
        assert_eq!(storage.saved_metadata.borrow().len(), 1);
        assert_eq!(indexer.indexed.borrow().len(), 1);
        Ok(())
    }
}