1use std::path::PathBuf;
4
5use crate::{
6 FetchResult, Fetcher, HeadingFilterStats, LanguageFilter, MarkdownParser, ParseResult,
7 PerformanceMetrics, Result, SearchIndex, Source, SourceType, Storage, TocEntry,
8};
9
10use crate::json_builder::build_llms_json;
11use crate::url_resolver::resolve_best_url;
12
/// Storage operations needed by the refresh and reindex workflows.
///
/// The workflows in this module are generic over this trait, so they can run
/// against the real [`Storage`] backend or an in-memory test double.
pub trait RefreshStorage {
    /// Loads the persisted [`Source`] metadata for `alias`.
    fn load_metadata(&self, alias: &str) -> Result<Source>;
    /// Loads the list of aliases recorded for `alias`.
    fn load_llms_aliases(&self, alias: &str) -> Result<Vec<String>>;
    /// Persists the raw document text for `alias`.
    fn save_llms_txt(&self, alias: &str, content: &str) -> Result<()>;
    /// Persists the structured JSON representation for `alias`.
    fn save_llms_json(&self, alias: &str, data: &crate::LlmsJson) -> Result<()>;
    /// Persists the source metadata for `alias`.
    fn save_metadata(&self, alias: &str, metadata: &Source) -> Result<()>;
    /// Returns the path handed to the indexer for `alias`.
    fn index_path(&self, alias: &str) -> Result<PathBuf>;
    /// Loads the previously saved raw document text for `alias`.
    fn load_llms_txt(&self, alias: &str) -> Result<String>;
}
30
31impl RefreshStorage for Storage {
32 fn load_metadata(&self, alias: &str) -> Result<Source> {
33 Self::load_source_metadata(self, alias)?
34 .ok_or_else(|| crate::Error::NotFound(format!("Missing metadata for {alias}")))
35 }
36
37 fn load_llms_aliases(&self, alias: &str) -> Result<Vec<String>> {
38 match Self::load_llms_json(self, alias) {
39 Ok(llms) => Ok(llms.metadata.aliases),
40 Err(_) => Ok(Vec::new()),
41 }
42 }
43
44 fn save_llms_txt(&self, alias: &str, content: &str) -> Result<()> {
45 Self::save_llms_txt(self, alias, content)
46 }
47
48 fn save_llms_json(&self, alias: &str, data: &crate::LlmsJson) -> Result<()> {
49 Self::save_llms_json(self, alias, data)
50 }
51
52 fn save_metadata(&self, alias: &str, metadata: &Source) -> Result<()> {
53 Self::save_source_metadata(self, alias, metadata)
54 }
55
56 fn index_path(&self, alias: &str) -> Result<PathBuf> {
57 Self::index_dir(self, alias)
58 }
59
60 fn load_llms_txt(&self, alias: &str) -> Result<String> {
61 Self::load_llms_txt(self, alias)
62 }
63}
64
/// Indexing step used by the refresh and reindex workflows.
///
/// Kept behind a trait so the workflows can be exercised with a stand-in
/// indexer instead of a real search index.
pub trait RefreshIndexer {
    /// Indexes `blocks` for `alias` at `index_path`.
    ///
    /// `metrics` is handed to the implementation; how it is used is up to the
    /// implementor.
    fn index(
        &self,
        alias: &str,
        index_path: &std::path::Path,
        metrics: PerformanceMetrics,
        blocks: &[crate::HeadingBlock],
    ) -> Result<()>;
}
76
/// [`RefreshIndexer`] implementation backed by [`SearchIndex`].
#[derive(Default)]
pub struct DefaultRefreshIndexer;
80
81impl RefreshIndexer for DefaultRefreshIndexer {
82 fn index(
83 &self,
84 alias: &str,
85 index_path: &std::path::Path,
86 metrics: PerformanceMetrics,
87 blocks: &[crate::HeadingBlock],
88 ) -> Result<()> {
89 let index = SearchIndex::create_or_open(index_path)?.with_metrics(metrics);
90 index.index_blocks(alias, blocks)
91 }
92}
93
/// Result of a refresh attempt for a single source.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum RefreshOutcome {
    /// New content was fetched, persisted, and indexed.
    Refreshed {
        /// Alias of the refreshed source.
        alias: String,
        /// Number of table-of-contents entries, nested entries included.
        headings: usize,
        /// Total line count reported by the document's line index.
        lines: usize,
    },
    /// The fetch reported the document as not modified.
    Unchanged {
        /// Alias of the source that was checked.
        alias: String,
    },
}
112
/// Summary of a reindex run over already-stored content.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ReindexOutcome {
    /// Alias of the reindexed source.
    pub alias: String,
    /// Number of heading blocks parsed before language filtering.
    pub headings_before: usize,
    /// Number of heading blocks remaining after language filtering.
    pub headings_after: usize,
    /// Number of heading blocks removed by the filter
    /// (`headings_before - headings_after`, saturating at zero).
    pub filtered: usize,
}
125
/// Freshly fetched document together with the values recorded alongside it.
#[derive(Debug, Clone)]
pub struct RefreshPayload {
    /// Document text that is parsed and saved.
    pub content: String,
    /// Digest string stored in the metadata as `sha256`
    /// (presumably the SHA-256 of `content`; confirm against the fetcher).
    pub sha256: String,
    /// `ETag` value to store for the next conditional fetch, if any.
    pub etag: Option<String>,
    /// `Last-Modified` value to store for the next conditional fetch, if any.
    pub last_modified: Option<String>,
}
138
/// URL and variant chosen for a refresh.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct RefreshUrlResolution {
    /// URL the refresh should fetch from.
    pub final_url: String,
    /// Variant associated with `final_url`.
    pub variant: crate::SourceVariant,
    /// `true` when resolution switched an `Llms` source to an `LlmsFull` URL.
    pub upgraded: bool,
}
149
150pub async fn resolve_refresh_url(
152 fetcher: &Fetcher,
153 metadata: &Source,
154) -> Result<RefreshUrlResolution> {
155 if metadata.variant != crate::SourceVariant::Llms {
156 return Ok(RefreshUrlResolution {
157 final_url: metadata.url.clone(),
158 variant: metadata.variant.clone(),
159 upgraded: false,
160 });
161 }
162
163 match resolve_best_url(fetcher, &metadata.url).await {
164 Ok(resolved) if resolved.variant == crate::SourceVariant::LlmsFull => {
165 Ok(RefreshUrlResolution {
166 final_url: resolved.final_url,
167 variant: resolved.variant,
168 upgraded: true,
169 })
170 },
171 Ok(_) | Err(_) => Ok(RefreshUrlResolution {
172 final_url: metadata.url.clone(),
173 variant: metadata.variant.clone(),
174 upgraded: false,
175 }),
176 }
177}
178
179pub async fn refresh_source<S, I>(
185 storage: &S,
186 fetcher: &Fetcher,
187 alias: &str,
188 metrics: PerformanceMetrics,
189 indexer: &I,
190 filter_preference: bool,
191) -> Result<RefreshOutcome>
192where
193 S: RefreshStorage + Sync,
194 I: RefreshIndexer + Sync,
195{
196 let existing_metadata = storage.load_metadata(alias)?;
197 let existing_aliases = storage.load_llms_aliases(alias)?;
198 let resolution = resolve_refresh_url(fetcher, &existing_metadata).await?;
199
200 refresh_source_with_metadata(
201 storage,
202 fetcher,
203 alias,
204 existing_metadata,
205 existing_aliases,
206 &resolution,
207 metrics,
208 indexer,
209 filter_preference,
210 )
211 .await
212}
213
214#[allow(clippy::too_many_arguments)]
220pub async fn refresh_source_with_metadata<S, I>(
221 storage: &S,
222 fetcher: &Fetcher,
223 alias: &str,
224 existing_metadata: Source,
225 existing_aliases: Vec<String>,
226 resolution: &RefreshUrlResolution,
227 metrics: PerformanceMetrics,
228 indexer: &I,
229 filter_preference: bool,
230) -> Result<RefreshOutcome>
231where
232 S: RefreshStorage + Sync,
233 I: RefreshIndexer + Sync,
234{
235 let fetch_result = fetcher
236 .fetch_with_cache(
237 &resolution.final_url,
238 existing_metadata.etag.as_deref(),
239 existing_metadata.last_modified.as_deref(),
240 )
241 .await?;
242
243 match fetch_result {
244 FetchResult::NotModified { .. } => {
245 if existing_metadata.filter_non_english.unwrap_or(true) != filter_preference {
246 let mut updated_metadata = existing_metadata.clone();
247 updated_metadata.filter_non_english = Some(filter_preference);
248 storage.save_metadata(alias, &updated_metadata)?;
249 }
250 Ok(RefreshOutcome::Unchanged {
251 alias: alias.to_string(),
252 })
253 },
254 FetchResult::Modified {
255 content,
256 sha256,
257 etag,
258 last_modified,
259 } => {
260 let payload = RefreshPayload {
261 content,
262 sha256,
263 etag,
264 last_modified,
265 };
266
267 let mut updated_metadata = existing_metadata.clone();
268 updated_metadata.url.clone_from(&resolution.final_url);
269 updated_metadata.variant = resolution.variant.clone();
270 updated_metadata.filter_non_english = Some(filter_preference);
271
272 apply_refresh(
273 storage,
274 alias,
275 updated_metadata,
276 existing_aliases,
277 &payload,
278 metrics,
279 indexer,
280 )
281 },
282 }
283}
284
285pub fn reindex_source<S, I>(
291 storage: &S,
292 alias: &str,
293 metrics: PerformanceMetrics,
294 indexer: &I,
295 filter_preference: bool,
296) -> Result<ReindexOutcome>
297where
298 S: RefreshStorage,
299 I: RefreshIndexer,
300{
301 let content = storage.load_llms_txt(alias)?;
302 let mut parser = MarkdownParser::new()?;
303 let mut parse_result = parser.parse(&content)?;
304
305 let before_count = parse_result.heading_blocks.len();
306 apply_language_filter(&mut parse_result, filter_preference);
307 let after_count = parse_result.heading_blocks.len();
308
309 let index_path = storage.index_path(alias)?;
310 indexer.index(
311 alias,
312 index_path.as_path(),
313 metrics,
314 &parse_result.heading_blocks,
315 )?;
316
317 Ok(ReindexOutcome {
318 alias: alias.to_string(),
319 headings_before: before_count,
320 headings_after: after_count,
321 filtered: before_count.saturating_sub(after_count),
322 })
323}
324
/// Returns the sorted, de-duplicated union of `existing_aliases` and
/// `metadata_aliases`.
///
/// Everything is appended first and duplicates are removed by the sort +
/// dedup pass, so no per-item membership scan is needed.
fn merge_aliases(existing_aliases: Vec<String>, metadata_aliases: &[String]) -> Vec<String> {
    let mut merged = existing_aliases;
    merged.extend_from_slice(metadata_aliases);
    // Equal strings are indistinguishable, so an unstable sort gives the same result.
    merged.sort_unstable();
    merged.dedup();
    merged
}
337
338fn copy_preserved_metadata_fields(llms_json: &mut crate::LlmsJson, existing: &Source) {
340 llms_json.metadata.tags.clone_from(&existing.tags);
341 llms_json
342 .metadata
343 .description
344 .clone_from(&existing.description);
345 llms_json.metadata.category.clone_from(&existing.category);
346 llms_json
347 .metadata
348 .npm_aliases
349 .clone_from(&existing.npm_aliases);
350 llms_json
351 .metadata
352 .github_aliases
353 .clone_from(&existing.github_aliases);
354 llms_json.metadata.variant = existing.variant.clone();
355}
356
357fn resolve_origin(existing: &Source) -> crate::SourceOrigin {
359 let mut origin = existing.origin.clone();
360 origin.source_type = match (&origin.source_type, &existing.origin.source_type) {
361 (Some(SourceType::Remote { .. }), _) | (None, None) => Some(SourceType::Remote {
362 url: existing.url.clone(),
363 }),
364 (Some(SourceType::LocalFile { path }), _) => {
365 Some(SourceType::LocalFile { path: path.clone() })
366 },
367 (None, Some(existing_type)) => Some(existing_type.clone()),
368 };
369 origin
370}
371
372fn build_refresh_metadata(
374 existing: Source,
375 payload: &RefreshPayload,
376 origin: crate::SourceOrigin,
377) -> Source {
378 Source {
379 url: existing.url,
380 etag: payload.etag.clone(),
381 last_modified: payload.last_modified.clone(),
382 fetched_at: chrono::Utc::now(),
383 sha256: payload.sha256.clone(),
384 variant: existing.variant,
385 aliases: existing.aliases,
386 tags: existing.tags,
387 description: existing.description,
388 category: existing.category,
389 npm_aliases: existing.npm_aliases,
390 github_aliases: existing.github_aliases,
391 origin,
392 filter_non_english: existing.filter_non_english,
393 }
394}
395
396#[allow(clippy::too_many_arguments)]
402pub fn apply_refresh<S, I>(
403 storage: &S,
404 alias: &str,
405 existing_metadata: Source,
406 existing_aliases: Vec<String>,
407 payload: &RefreshPayload,
408 metrics: PerformanceMetrics,
409 indexer: &I,
410) -> Result<RefreshOutcome>
411where
412 S: RefreshStorage,
413 I: RefreshIndexer,
414{
415 let mut parser = MarkdownParser::new()?;
416 let mut parse_result = parser.parse(&payload.content)?;
417
418 let filter_enabled = existing_metadata.filter_non_english.unwrap_or(true);
419 let filter_stats = Some(apply_language_filter(&mut parse_result, filter_enabled));
420
421 storage.save_llms_txt(alias, &payload.content)?;
422
423 let mut llms_json = build_llms_json(
424 alias,
425 &existing_metadata.url,
426 "llms.txt",
427 payload.sha256.clone(),
428 payload.etag.clone(),
429 payload.last_modified.clone(),
430 &parse_result,
431 );
432
433 llms_json.metadata.aliases = merge_aliases(existing_aliases, &existing_metadata.aliases);
434 copy_preserved_metadata_fields(&mut llms_json, &existing_metadata);
435 llms_json.filter_stats = filter_stats;
436
437 storage.save_llms_json(alias, &llms_json)?;
438
439 let origin = resolve_origin(&existing_metadata);
440 llms_json.metadata.origin = origin.clone();
441
442 let metadata = build_refresh_metadata(existing_metadata, payload, origin);
443 storage.save_metadata(alias, &metadata)?;
444
445 let index_path = storage.index_path(alias)?;
446 indexer.index(
447 alias,
448 index_path.as_path(),
449 metrics,
450 &parse_result.heading_blocks,
451 )?;
452
453 Ok(RefreshOutcome::Refreshed {
454 alias: alias.to_string(),
455 headings: count_headings(&llms_json.toc),
456 lines: llms_json.line_index.total_lines,
457 })
458}
459
460fn apply_language_filter(
461 parse_result: &mut ParseResult,
462 filter_enabled: bool,
463) -> HeadingFilterStats {
464 let original_count = parse_result.heading_blocks.len();
465 if filter_enabled {
466 let mut language_filter = LanguageFilter::new(true);
467 parse_result.heading_blocks.retain(|block| {
468 let urls_in_content = extract_urls_from_content(&block.content);
469 let url_check = urls_in_content.is_empty()
470 || urls_in_content
471 .iter()
472 .all(|url| language_filter.is_english_url(url));
473
474 let heading_check = language_filter.is_english_heading_path(&block.path);
475
476 url_check && heading_check
477 });
478 }
479
480 let accepted = parse_result.heading_blocks.len();
481 let filtered_count = original_count.saturating_sub(accepted);
482 HeadingFilterStats {
483 enabled: filter_enabled,
484 headings_total: original_count,
485 headings_accepted: accepted,
486 headings_rejected: filtered_count,
487 reason: if filter_enabled {
488 "non-English content removed".to_string()
489 } else {
490 "filtering disabled".to_string()
491 },
492 }
493}
494
495fn count_headings(entries: &[TocEntry]) -> usize {
496 entries
497 .iter()
498 .map(|entry| 1 + count_headings(&entry.children))
499 .sum()
500}
501
502fn extract_urls_from_content(content: &str) -> Vec<String> {
503 let mut urls = Vec::new();
504
505 let mut search_start = 0;
506 while let Some(rel) = content[search_start..].find('[') {
507 let open_idx = search_start + rel;
508 if let Some(close_rel) = content[open_idx + 1..].find(']') {
509 let close_idx = open_idx + 1 + close_rel;
510 let after_bracket = content.get(close_idx + 1..).unwrap_or("");
511 if let Some(rest) = after_bracket.strip_prefix('(') {
512 if let Some(paren_rel) = rest.find(')') {
513 if let Some(cleaned) = clean_url_slice(&rest[..paren_rel]) {
514 urls.push(cleaned.to_string());
515 }
516 }
517 }
518 }
519 search_start = open_idx + 1;
520 }
521
522 urls
523}
524
525fn clean_url_slice(s: &str) -> Option<&str> {
526 let trimmed = s.trim();
527 if trimmed.is_empty() {
528 return None;
529 }
530
531 let trimmed = trimmed
532 .strip_prefix('"')
533 .or_else(|| trimmed.strip_prefix('\''))
534 .unwrap_or(trimmed);
535
536 let trimmed = trimmed
537 .strip_suffix('"')
538 .or_else(|| trimmed.strip_suffix('\''))
539 .unwrap_or(trimmed);
540
541 let mut end = trimmed.len();
542 for (idx, ch) in trimmed.char_indices().rev() {
543 if trailing_punctuation(ch) {
544 end = idx;
545 } else {
546 break;
547 }
548 }
549
550 if end == 0 {
551 None
552 } else {
553 Some(&trimmed[..end])
554 }
555}
556
/// Returns `true` for punctuation that is stripped from the end of a link
/// target: `,`, `.`, `;`, `:`, `!`, and `?`.
const fn trailing_punctuation(c: char) -> bool {
    c == ',' || c == '.' || c == ';' || c == ':' || c == '!' || c == '?'
}
560
#[cfg(test)]
mod tests {
    use super::*;
    use std::cell::RefCell;
    use std::collections::HashMap;

    use anyhow::Result;

    /// In-memory [`RefreshStorage`] double.
    ///
    /// Reads are served from the maps; writes are only recorded so tests can
    /// assert that they happened.
    #[derive(Default)]
    struct MockStorage {
        metadata: HashMap<String, Source>,
        saved_txt: RefCell<Vec<String>>,
        saved_json: RefCell<Vec<String>>,
        saved_metadata: RefCell<Vec<Source>>,
        index_paths: HashMap<String, PathBuf>,
        cached_txt: HashMap<String, String>,
    }

    impl RefreshStorage for MockStorage {
        fn load_metadata(&self, alias: &str) -> crate::Result<Source> {
            match self.metadata.get(alias) {
                Some(source) => Ok(source.clone()),
                None => Err(crate::Error::NotFound("missing metadata".to_string())),
            }
        }

        fn load_llms_aliases(&self, _alias: &str) -> crate::Result<Vec<String>> {
            Ok(Vec::new())
        }

        fn save_llms_txt(&self, alias: &str, _content: &str) -> crate::Result<()> {
            let mut log = self.saved_txt.borrow_mut();
            log.push(alias.to_string());
            Ok(())
        }

        fn save_llms_json(&self, alias: &str, _data: &crate::LlmsJson) -> crate::Result<()> {
            let mut log = self.saved_json.borrow_mut();
            log.push(alias.to_string());
            Ok(())
        }

        fn save_metadata(&self, _alias: &str, metadata: &Source) -> crate::Result<()> {
            let mut log = self.saved_metadata.borrow_mut();
            log.push(metadata.clone());
            Ok(())
        }

        fn index_path(&self, alias: &str) -> crate::Result<PathBuf> {
            match self.index_paths.get(alias) {
                Some(path) => Ok(path.clone()),
                None => Err(crate::Error::NotFound("missing index path".to_string())),
            }
        }

        fn load_llms_txt(&self, alias: &str) -> crate::Result<String> {
            match self.cached_txt.get(alias) {
                Some(text) => Ok(text.clone()),
                None => Err(crate::Error::NotFound(format!(
                    "missing llms.txt for {alias}"
                ))),
            }
        }
    }

    /// [`RefreshIndexer`] double that records the aliases it was asked to index.
    #[derive(Default)]
    struct MockIndexer {
        indexed: RefCell<Vec<String>>,
    }

    impl RefreshIndexer for MockIndexer {
        fn index(
            &self,
            alias: &str,
            _index_path: &std::path::Path,
            _metrics: PerformanceMetrics,
            _blocks: &[crate::HeadingBlock],
        ) -> crate::Result<()> {
            let mut log = self.indexed.borrow_mut();
            log.push(alias.to_string());
            Ok(())
        }
    }

    /// Remote source with no cache validators and filtering enabled.
    fn sample_source() -> Source {
        let url = "https://example.com/llms.txt".to_string();
        let origin = crate::SourceOrigin {
            manifest: None,
            source_type: Some(SourceType::Remote { url: url.clone() }),
        };

        Source {
            url,
            etag: None,
            last_modified: None,
            fetched_at: chrono::Utc::now(),
            sha256: "abc123".to_string(),
            variant: crate::SourceVariant::Llms,
            aliases: Vec::new(),
            tags: Vec::new(),
            description: None,
            category: None,
            npm_aliases: Vec::new(),
            github_aliases: Vec::new(),
            origin,
            filter_non_english: Some(true),
        }
    }

    /// Minimal one-heading document without cache validators.
    fn sample_payload() -> RefreshPayload {
        RefreshPayload {
            content: "# Title\n\nSome content.\n".to_string(),
            sha256: "abc123".to_string(),
            etag: None,
            last_modified: None,
        }
    }

    /// A successful refresh writes the text, JSON, and metadata once each and
    /// indexes once.
    #[test]
    fn apply_refresh_persists_changes() -> Result<()> {
        let storage = MockStorage {
            metadata: HashMap::from([("test".to_string(), sample_source())]),
            index_paths: HashMap::from([("test".to_string(), PathBuf::from("index"))]),
            ..MockStorage::default()
        };
        let indexer = MockIndexer::default();
        let payload = sample_payload();

        let outcome = apply_refresh(
            &storage,
            "test",
            sample_source(),
            Vec::new(),
            &payload,
            PerformanceMetrics::default(),
            &indexer,
        )?;

        assert!(matches!(outcome, RefreshOutcome::Refreshed { .. }));
        assert_eq!(storage.saved_txt.borrow().len(), 1);
        assert_eq!(storage.saved_json.borrow().len(), 1);
        assert_eq!(storage.saved_metadata.borrow().len(), 1);
        assert_eq!(indexer.indexed.borrow().len(), 1);
        Ok(())
    }
}