blz_core/
refresh.rs

1//! Refresh helpers shared by CLI and MCP consumers.
2
3use std::path::PathBuf;
4
5use crate::{
6    FetchResult, Fetcher, HeadingFilterStats, LanguageFilter, MarkdownParser, ParseResult,
7    PerformanceMetrics, Result, SearchIndex, Source, SourceType, Storage, TocEntry,
8};
9
10use crate::json_builder::build_llms_json;
11use crate::url_resolver::resolve_best_url;
12
13/// Abstraction over storage interactions used by refresh routines.
14pub trait RefreshStorage {
15    /// Load stored metadata for a source alias.
16    fn load_metadata(&self, alias: &str) -> Result<Source>;
17    /// Load alias list from the cached llms.json for a source.
18    fn load_llms_aliases(&self, alias: &str) -> Result<Vec<String>>;
19    /// Persist the latest llms.txt content.
20    fn save_llms_txt(&self, alias: &str, content: &str) -> Result<()>;
21    /// Persist the computed llms.json metadata payload.
22    fn save_llms_json(&self, alias: &str, data: &crate::LlmsJson) -> Result<()>;
23    /// Persist updated source metadata.
24    fn save_metadata(&self, alias: &str, metadata: &Source) -> Result<()>;
25    /// Resolve the on-disk index path for a source.
26    fn index_path(&self, alias: &str) -> Result<PathBuf>;
27    /// Load cached llms.txt content for a source.
28    fn load_llms_txt(&self, alias: &str) -> Result<String>;
29}
30
31impl RefreshStorage for Storage {
32    fn load_metadata(&self, alias: &str) -> Result<Source> {
33        Self::load_source_metadata(self, alias)?
34            .ok_or_else(|| crate::Error::NotFound(format!("Missing metadata for {alias}")))
35    }
36
37    fn load_llms_aliases(&self, alias: &str) -> Result<Vec<String>> {
38        match Self::load_llms_json(self, alias) {
39            Ok(llms) => Ok(llms.metadata.aliases),
40            Err(_) => Ok(Vec::new()),
41        }
42    }
43
44    fn save_llms_txt(&self, alias: &str, content: &str) -> Result<()> {
45        Self::save_llms_txt(self, alias, content)
46    }
47
48    fn save_llms_json(&self, alias: &str, data: &crate::LlmsJson) -> Result<()> {
49        Self::save_llms_json(self, alias, data)
50    }
51
52    fn save_metadata(&self, alias: &str, metadata: &Source) -> Result<()> {
53        Self::save_source_metadata(self, alias, metadata)
54    }
55
56    fn index_path(&self, alias: &str) -> Result<PathBuf> {
57        Self::index_dir(self, alias)
58    }
59
60    fn load_llms_txt(&self, alias: &str) -> Result<String> {
61        Self::load_llms_txt(self, alias)
62    }
63}
64
65/// Interface for indexing refreshed content.
66pub trait RefreshIndexer {
67    /// Index a set of heading blocks for the given alias.
68    fn index(
69        &self,
70        alias: &str,
71        index_path: &std::path::Path,
72        metrics: PerformanceMetrics,
73        blocks: &[crate::HeadingBlock],
74    ) -> Result<()>;
75}
76
/// Default indexer that writes to the Tantivy search index.
///
/// The type carries no state, so it is cheap to copy and can be created with
/// `DefaultRefreshIndexer` or `DefaultRefreshIndexer::default()`.
#[derive(Debug, Clone, Copy, Default)]
pub struct DefaultRefreshIndexer;
80
81impl RefreshIndexer for DefaultRefreshIndexer {
82    fn index(
83        &self,
84        alias: &str,
85        index_path: &std::path::Path,
86        metrics: PerformanceMetrics,
87        blocks: &[crate::HeadingBlock],
88    ) -> Result<()> {
89        let index = SearchIndex::create_or_open(index_path)?.with_metrics(metrics);
90        index.index_blocks(alias, blocks)
91    }
92}
93
/// What a refresh operation ended up doing.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum RefreshOutcome {
    /// New content was fetched, persisted, and re-indexed.
    Refreshed {
        /// Alias of the source that was refreshed.
        alias: String,
        /// Number of headings counted in the refreshed table of contents.
        headings: usize,
        /// Total number of lines in the refreshed content.
        lines: usize,
    },
    /// The remote content had not changed, so nothing was re-indexed.
    Unchanged {
        /// Alias of the source that was left as-is.
        alias: String,
    },
}
112
/// Counts reported by a reindex operation.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ReindexOutcome {
    /// Alias of the source that was reindexed.
    pub alias: String,
    /// Number of heading blocks parsed before the language filter ran.
    pub headings_before: usize,
    /// Number of heading blocks remaining after the language filter ran.
    pub headings_after: usize,
    /// Number of heading blocks the filter removed.
    pub filtered: usize,
}
125
/// Data describing remote changes.
///
/// Derives `PartialEq`/`Eq` like the other result types in this module so
/// payloads can be compared directly (for example in tests).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct RefreshPayload {
    /// Refreshed llms.txt content.
    pub content: String,
    /// SHA256 hash of the refreshed content.
    pub sha256: String,
    /// `ETag` header value from the response.
    pub etag: Option<String>,
    /// Last-Modified header value from the response.
    pub last_modified: Option<String>,
}
138
139/// URL resolution details for refresh operations.
140#[derive(Debug, Clone, PartialEq, Eq)]
141pub struct RefreshUrlResolution {
142    /// Final URL used for refresh.
143    pub final_url: String,
144    /// Variant chosen for the URL.
145    pub variant: crate::SourceVariant,
146    /// Whether the source was upgraded to llms-full.txt.
147    pub upgraded: bool,
148}
149
150/// Resolve the best refresh URL (llms.txt → llms-full.txt) when available.
151pub async fn resolve_refresh_url(
152    fetcher: &Fetcher,
153    metadata: &Source,
154) -> Result<RefreshUrlResolution> {
155    if metadata.variant != crate::SourceVariant::Llms {
156        return Ok(RefreshUrlResolution {
157            final_url: metadata.url.clone(),
158            variant: metadata.variant.clone(),
159            upgraded: false,
160        });
161    }
162
163    match resolve_best_url(fetcher, &metadata.url).await {
164        Ok(resolved) if resolved.variant == crate::SourceVariant::LlmsFull => {
165            Ok(RefreshUrlResolution {
166                final_url: resolved.final_url,
167                variant: resolved.variant,
168                upgraded: true,
169            })
170        },
171        Ok(_) | Err(_) => Ok(RefreshUrlResolution {
172            final_url: metadata.url.clone(),
173            variant: metadata.variant.clone(),
174            upgraded: false,
175        }),
176    }
177}
178
179/// Refresh a source using its current metadata.
180pub async fn refresh_source<S, I>(
181    storage: &S,
182    fetcher: &Fetcher,
183    alias: &str,
184    metrics: PerformanceMetrics,
185    indexer: &I,
186    filter_preference: bool,
187) -> Result<RefreshOutcome>
188where
189    S: RefreshStorage + Sync,
190    I: RefreshIndexer + Sync,
191{
192    let existing_metadata = storage.load_metadata(alias)?;
193    let existing_aliases = storage.load_llms_aliases(alias)?;
194    let resolution = resolve_refresh_url(fetcher, &existing_metadata).await?;
195
196    refresh_source_with_metadata(
197        storage,
198        fetcher,
199        alias,
200        existing_metadata,
201        existing_aliases,
202        &resolution,
203        metrics,
204        indexer,
205        filter_preference,
206    )
207    .await
208}
209
210/// Refresh a source using preloaded metadata and URL resolution.
211#[allow(clippy::too_many_arguments)]
212pub async fn refresh_source_with_metadata<S, I>(
213    storage: &S,
214    fetcher: &Fetcher,
215    alias: &str,
216    existing_metadata: Source,
217    existing_aliases: Vec<String>,
218    resolution: &RefreshUrlResolution,
219    metrics: PerformanceMetrics,
220    indexer: &I,
221    filter_preference: bool,
222) -> Result<RefreshOutcome>
223where
224    S: RefreshStorage + Sync,
225    I: RefreshIndexer + Sync,
226{
227    let fetch_result = fetcher
228        .fetch_with_cache(
229            &resolution.final_url,
230            existing_metadata.etag.as_deref(),
231            existing_metadata.last_modified.as_deref(),
232        )
233        .await?;
234
235    match fetch_result {
236        FetchResult::NotModified { .. } => {
237            if existing_metadata.filter_non_english.unwrap_or(true) != filter_preference {
238                let mut updated_metadata = existing_metadata.clone();
239                updated_metadata.filter_non_english = Some(filter_preference);
240                storage.save_metadata(alias, &updated_metadata)?;
241            }
242            Ok(RefreshOutcome::Unchanged {
243                alias: alias.to_string(),
244            })
245        },
246        FetchResult::Modified {
247            content,
248            sha256,
249            etag,
250            last_modified,
251        } => {
252            let payload = RefreshPayload {
253                content,
254                sha256,
255                etag,
256                last_modified,
257            };
258
259            let mut updated_metadata = existing_metadata.clone();
260            updated_metadata.url.clone_from(&resolution.final_url);
261            updated_metadata.variant = resolution.variant.clone();
262            updated_metadata.filter_non_english = Some(filter_preference);
263
264            apply_refresh(
265                storage,
266                alias,
267                updated_metadata,
268                existing_aliases,
269                payload,
270                metrics,
271                indexer,
272            )
273        },
274    }
275}
276
277/// Re-parse and re-index a source using cached content.
278pub fn reindex_source<S, I>(
279    storage: &S,
280    alias: &str,
281    metrics: PerformanceMetrics,
282    indexer: &I,
283    filter_preference: bool,
284) -> Result<ReindexOutcome>
285where
286    S: RefreshStorage,
287    I: RefreshIndexer,
288{
289    let content = storage.load_llms_txt(alias)?;
290    let mut parser = MarkdownParser::new()?;
291    let mut parse_result = parser.parse(&content)?;
292
293    let before_count = parse_result.heading_blocks.len();
294    apply_language_filter(&mut parse_result, filter_preference);
295    let after_count = parse_result.heading_blocks.len();
296
297    let index_path = storage.index_path(alias)?;
298    indexer.index(
299        alias,
300        index_path.as_path(),
301        metrics,
302        &parse_result.heading_blocks,
303    )?;
304
305    Ok(ReindexOutcome {
306        alias: alias.to_string(),
307        headings_before: before_count,
308        headings_after: after_count,
309        filtered: before_count.saturating_sub(after_count),
310    })
311}
312
313/// Apply a refresh: persist content and re-index the source.
314#[allow(clippy::too_many_arguments)]
315#[allow(clippy::too_many_lines)]
316pub fn apply_refresh<S, I>(
317    storage: &S,
318    alias: &str,
319    existing_metadata: Source,
320    existing_aliases: Vec<String>,
321    payload: RefreshPayload,
322    metrics: PerformanceMetrics,
323    indexer: &I,
324) -> Result<RefreshOutcome>
325where
326    S: RefreshStorage,
327    I: RefreshIndexer,
328{
329    let mut parser = MarkdownParser::new()?;
330    let mut parse_result = parser.parse(&payload.content)?;
331
332    let filter_enabled = existing_metadata.filter_non_english.unwrap_or(true);
333    let filter_stats = Some(apply_language_filter(&mut parse_result, filter_enabled));
334
335    storage.save_llms_txt(alias, &payload.content)?;
336
337    let mut llms_json = build_llms_json(
338        alias,
339        &existing_metadata.url,
340        "llms.txt",
341        payload.sha256.clone(),
342        payload.etag.clone(),
343        payload.last_modified.clone(),
344        &parse_result,
345    );
346
347    let mut metadata_aliases = existing_aliases;
348    for alias_value in &existing_metadata.aliases {
349        if !metadata_aliases.contains(alias_value) {
350            metadata_aliases.push(alias_value.clone());
351        }
352    }
353    metadata_aliases.sort();
354    metadata_aliases.dedup();
355    llms_json.metadata.aliases = metadata_aliases;
356    llms_json.metadata.tags.clone_from(&existing_metadata.tags);
357    llms_json
358        .metadata
359        .description
360        .clone_from(&existing_metadata.description);
361    llms_json
362        .metadata
363        .category
364        .clone_from(&existing_metadata.category);
365    llms_json
366        .metadata
367        .npm_aliases
368        .clone_from(&existing_metadata.npm_aliases);
369    llms_json
370        .metadata
371        .github_aliases
372        .clone_from(&existing_metadata.github_aliases);
373    llms_json.metadata.variant = existing_metadata.variant.clone();
374    llms_json.filter_stats = filter_stats;
375
376    storage.save_llms_json(alias, &llms_json)?;
377
378    let mut origin = existing_metadata.origin.clone();
379    origin.source_type = match (&origin.source_type, &existing_metadata.origin.source_type) {
380        (Some(SourceType::Remote { .. }), _) | (None, None) => Some(SourceType::Remote {
381            url: existing_metadata.url.clone(),
382        }),
383        (Some(SourceType::LocalFile { path }), _) => {
384            Some(SourceType::LocalFile { path: path.clone() })
385        },
386        (None, Some(existing)) => Some(existing.clone()),
387    };
388
389    llms_json.metadata.origin = origin.clone();
390
391    let metadata = Source {
392        url: existing_metadata.url,
393        etag: payload.etag,
394        last_modified: payload.last_modified,
395        fetched_at: chrono::Utc::now(),
396        sha256: payload.sha256,
397        variant: existing_metadata.variant,
398        aliases: existing_metadata.aliases,
399        tags: existing_metadata.tags,
400        description: existing_metadata.description,
401        category: existing_metadata.category,
402        npm_aliases: existing_metadata.npm_aliases,
403        github_aliases: existing_metadata.github_aliases,
404        origin,
405        filter_non_english: existing_metadata.filter_non_english,
406    };
407    storage.save_metadata(alias, &metadata)?;
408
409    let index_path = storage.index_path(alias)?;
410    indexer.index(
411        alias,
412        index_path.as_path(),
413        metrics,
414        &parse_result.heading_blocks,
415    )?;
416
417    Ok(RefreshOutcome::Refreshed {
418        alias: alias.to_string(),
419        headings: count_headings(&llms_json.toc),
420        lines: llms_json.line_index.total_lines,
421    })
422}
423
424fn apply_language_filter(
425    parse_result: &mut ParseResult,
426    filter_enabled: bool,
427) -> HeadingFilterStats {
428    let original_count = parse_result.heading_blocks.len();
429    if filter_enabled {
430        let mut language_filter = LanguageFilter::new(true);
431        parse_result.heading_blocks.retain(|block| {
432            let urls_in_content = extract_urls_from_content(&block.content);
433            let url_check = urls_in_content.is_empty()
434                || urls_in_content
435                    .iter()
436                    .all(|url| language_filter.is_english_url(url));
437
438            let heading_check = language_filter.is_english_heading_path(&block.path);
439
440            url_check && heading_check
441        });
442    }
443
444    let accepted = parse_result.heading_blocks.len();
445    let filtered_count = original_count.saturating_sub(accepted);
446    HeadingFilterStats {
447        enabled: filter_enabled,
448        headings_total: original_count,
449        headings_accepted: accepted,
450        headings_rejected: filtered_count,
451        reason: if filter_enabled {
452            "non-English content removed".to_string()
453        } else {
454            "filtering disabled".to_string()
455        },
456    }
457}
458
459fn count_headings(entries: &[TocEntry]) -> usize {
460    entries
461        .iter()
462        .map(|entry| 1 + count_headings(&entry.children))
463        .sum()
464}
465
466fn extract_urls_from_content(content: &str) -> Vec<String> {
467    let mut urls = Vec::new();
468
469    let mut search_start = 0;
470    while let Some(rel) = content[search_start..].find('[') {
471        let open_idx = search_start + rel;
472        if let Some(close_rel) = content[open_idx + 1..].find(']') {
473            let close_idx = open_idx + 1 + close_rel;
474            let after_bracket = content.get(close_idx + 1..).unwrap_or("");
475            if let Some(rest) = after_bracket.strip_prefix('(') {
476                if let Some(paren_rel) = rest.find(')') {
477                    if let Some(cleaned) = clean_url_slice(&rest[..paren_rel]) {
478                        urls.push(cleaned.to_string());
479                    }
480                }
481            }
482        }
483        search_start = open_idx + 1;
484    }
485
486    urls
487}
488
489fn clean_url_slice(s: &str) -> Option<&str> {
490    let trimmed = s.trim();
491    if trimmed.is_empty() {
492        return None;
493    }
494
495    let trimmed = trimmed
496        .strip_prefix('"')
497        .or_else(|| trimmed.strip_prefix('\''))
498        .unwrap_or(trimmed);
499
500    let trimmed = trimmed
501        .strip_suffix('"')
502        .or_else(|| trimmed.strip_suffix('\''))
503        .unwrap_or(trimmed);
504
505    let mut end = trimmed.len();
506    for (idx, ch) in trimmed.char_indices().rev() {
507        if trailing_punctuation(ch) {
508            end = idx;
509        } else {
510            break;
511        }
512    }
513
514    if end == 0 {
515        None
516    } else {
517        Some(&trimmed[..end])
518    }
519}
520
/// Returns `true` for the punctuation characters stripped from the end of a
/// link target: `,` `.` `;` `:` `!` `?`.
const fn trailing_punctuation(c: char) -> bool {
    c == ',' || c == '.' || c == ';' || c == ':' || c == '!' || c == '?'
}
524
#[cfg(test)]
mod tests {
    use super::*;
    use std::cell::RefCell;
    use std::collections::HashMap;

    use anyhow::Result;

    /// In-memory [`RefreshStorage`] double.
    ///
    /// Lookups are answered from the maps; every save call is recorded so a
    /// test can count how many writes happened.
    #[derive(Default)]
    struct MockStorage {
        metadata: HashMap<String, Source>,
        saved_txt: RefCell<Vec<String>>,
        saved_json: RefCell<Vec<String>>,
        saved_metadata: RefCell<Vec<Source>>,
        index_paths: HashMap<String, PathBuf>,
        cached_txt: HashMap<String, String>,
    }

    impl RefreshStorage for MockStorage {
        fn load_metadata(&self, alias: &str) -> crate::Result<Source> {
            match self.metadata.get(alias) {
                Some(found) => Ok(found.clone()),
                None => Err(crate::Error::NotFound("missing metadata".to_string())),
            }
        }

        fn load_llms_aliases(&self, _alias: &str) -> crate::Result<Vec<String>> {
            Ok(Vec::new())
        }

        fn save_llms_txt(&self, alias: &str, _content: &str) -> crate::Result<()> {
            self.saved_txt.borrow_mut().push(alias.to_string());
            Ok(())
        }

        fn save_llms_json(&self, alias: &str, _data: &crate::LlmsJson) -> crate::Result<()> {
            self.saved_json.borrow_mut().push(alias.to_string());
            Ok(())
        }

        fn save_metadata(&self, _alias: &str, metadata: &Source) -> crate::Result<()> {
            self.saved_metadata.borrow_mut().push(metadata.clone());
            Ok(())
        }

        fn index_path(&self, alias: &str) -> crate::Result<PathBuf> {
            match self.index_paths.get(alias) {
                Some(path) => Ok(path.clone()),
                None => Err(crate::Error::NotFound("missing index path".to_string())),
            }
        }

        fn load_llms_txt(&self, alias: &str) -> crate::Result<String> {
            match self.cached_txt.get(alias) {
                Some(text) => Ok(text.clone()),
                None => Err(crate::Error::NotFound(format!(
                    "missing llms.txt for {alias}"
                ))),
            }
        }
    }

    /// [`RefreshIndexer`] double that records the aliases it was asked to index.
    #[derive(Default)]
    struct MockIndexer {
        indexed: RefCell<Vec<String>>,
    }

    impl RefreshIndexer for MockIndexer {
        fn index(
            &self,
            alias: &str,
            _index_path: &std::path::Path,
            _metrics: PerformanceMetrics,
            _blocks: &[crate::HeadingBlock],
        ) -> crate::Result<()> {
            self.indexed.borrow_mut().push(alias.to_string());
            Ok(())
        }
    }

    /// Remote llms.txt source with the language filter enabled.
    fn source_fixture() -> Source {
        let url = "https://example.com/llms.txt".to_string();
        Source {
            url: url.clone(),
            etag: None,
            last_modified: None,
            fetched_at: chrono::Utc::now(),
            sha256: "abc123".to_string(),
            variant: crate::SourceVariant::Llms,
            aliases: Vec::new(),
            tags: Vec::new(),
            description: None,
            category: None,
            npm_aliases: Vec::new(),
            github_aliases: Vec::new(),
            origin: crate::SourceOrigin {
                manifest: None,
                source_type: Some(SourceType::Remote { url }),
            },
            filter_non_english: Some(true),
        }
    }

    /// Minimal one-heading payload without cache validators.
    fn payload_fixture() -> RefreshPayload {
        RefreshPayload {
            content: "# Title\n\nSome content.\n".to_string(),
            sha256: "abc123".to_string(),
            etag: None,
            last_modified: None,
        }
    }

    #[test]
    fn apply_refresh_persists_changes() -> Result<()> {
        let storage = MockStorage {
            metadata: HashMap::from([("test".to_string(), source_fixture())]),
            index_paths: HashMap::from([("test".to_string(), PathBuf::from("index"))]),
            ..MockStorage::default()
        };
        let indexer = MockIndexer::default();

        let outcome = apply_refresh(
            &storage,
            "test",
            source_fixture(),
            Vec::new(),
            payload_fixture(),
            PerformanceMetrics::default(),
            &indexer,
        )?;

        assert!(matches!(outcome, RefreshOutcome::Refreshed { .. }));
        // Exactly one write of each kind, plus one indexing call.
        assert_eq!(storage.saved_txt.borrow().len(), 1);
        assert_eq!(storage.saved_json.borrow().len(), 1);
        assert_eq!(storage.saved_metadata.borrow().len(), 1);
        assert_eq!(indexer.indexed.borrow().len(), 1);
        Ok(())
    }
}