1use std::path::PathBuf;
4
5use crate::{
6 FetchResult, Fetcher, HeadingFilterStats, LanguageFilter, MarkdownParser, ParseResult,
7 PerformanceMetrics, Result, SearchIndex, Source, SourceType, Storage, TocEntry,
8};
9
10use crate::json_builder::build_llms_json;
11use crate::url_resolver::resolve_best_url;
12
/// Storage operations needed by the refresh and reindex workflows.
///
/// Keeping these behind a trait lets the workflows run against the real
/// [`Storage`] as well as against an in-memory double in tests.
pub trait RefreshStorage {
    /// Load the persisted [`Source`] metadata for `alias`.
    fn load_metadata(&self, alias: &str) -> Result<Source>;
    /// Load the alias list associated with `alias`.
    fn load_llms_aliases(&self, alias: &str) -> Result<Vec<String>>;
    /// Persist the raw llms.txt `content` for `alias`.
    fn save_llms_txt(&self, alias: &str, content: &str) -> Result<()>;
    /// Persist the structured llms.json `data` for `alias`.
    fn save_llms_json(&self, alias: &str, data: &crate::LlmsJson) -> Result<()>;
    /// Persist the [`Source`] `metadata` for `alias`.
    fn save_metadata(&self, alias: &str, metadata: &Source) -> Result<()>;
    /// Return the path of the search index belonging to `alias`.
    fn index_path(&self, alias: &str) -> Result<PathBuf>;
    /// Load the previously saved llms.txt content for `alias`.
    fn load_llms_txt(&self, alias: &str) -> Result<String>;
}
30
31impl RefreshStorage for Storage {
32 fn load_metadata(&self, alias: &str) -> Result<Source> {
33 Self::load_source_metadata(self, alias)?
34 .ok_or_else(|| crate::Error::NotFound(format!("Missing metadata for {alias}")))
35 }
36
37 fn load_llms_aliases(&self, alias: &str) -> Result<Vec<String>> {
38 match Self::load_llms_json(self, alias) {
39 Ok(llms) => Ok(llms.metadata.aliases),
40 Err(_) => Ok(Vec::new()),
41 }
42 }
43
44 fn save_llms_txt(&self, alias: &str, content: &str) -> Result<()> {
45 Self::save_llms_txt(self, alias, content)
46 }
47
48 fn save_llms_json(&self, alias: &str, data: &crate::LlmsJson) -> Result<()> {
49 Self::save_llms_json(self, alias, data)
50 }
51
52 fn save_metadata(&self, alias: &str, metadata: &Source) -> Result<()> {
53 Self::save_source_metadata(self, alias, metadata)
54 }
55
56 fn index_path(&self, alias: &str) -> Result<PathBuf> {
57 Self::index_dir(self, alias)
58 }
59
60 fn load_llms_txt(&self, alias: &str) -> Result<String> {
61 Self::load_llms_txt(self, alias)
62 }
63}
64
/// Indexing step used by the refresh and reindex workflows.
///
/// Abstracted as a trait so the workflows can be exercised without building
/// a real search index.
pub trait RefreshIndexer {
    /// Index `blocks` for `alias` into the search index at `index_path`,
    /// handing `metrics` to the index implementation.
    fn index(
        &self,
        alias: &str,
        index_path: &std::path::Path,
        metrics: PerformanceMetrics,
        blocks: &[crate::HeadingBlock],
    ) -> Result<()>;
}
76
/// Stateless [`RefreshIndexer`] that writes through [`SearchIndex`].
#[derive(Default)]
pub struct DefaultRefreshIndexer;
80
81impl RefreshIndexer for DefaultRefreshIndexer {
82 fn index(
83 &self,
84 alias: &str,
85 index_path: &std::path::Path,
86 metrics: PerformanceMetrics,
87 blocks: &[crate::HeadingBlock],
88 ) -> Result<()> {
89 let index = SearchIndex::create_or_open(index_path)?.with_metrics(metrics);
90 index.index_blocks(alias, blocks)
91 }
92}
93
/// Result of a refresh attempt for a single source.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum RefreshOutcome {
    /// New content was fetched, persisted and indexed.
    Refreshed {
        /// Alias of the refreshed source.
        alias: String,
        /// Number of table-of-contents entries, counted recursively.
        headings: usize,
        /// Total line count reported by the generated line index.
        lines: usize,
    },
    /// The remote reported that the content has not been modified.
    Unchanged {
        /// Alias of the source that was checked.
        alias: String,
    },
}
112
/// Summary of rebuilding the index for a source from its cached content.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ReindexOutcome {
    /// Alias of the reindexed source.
    pub alias: String,
    /// Heading block count before the language filter ran.
    pub headings_before: usize,
    /// Heading block count after the language filter ran.
    pub headings_after: usize,
    /// Number of heading blocks removed by the filter
    /// (`headings_before - headings_after`).
    pub filtered: usize,
}
125
/// Freshly fetched document together with its cache validators.
#[derive(Debug, Clone)]
pub struct RefreshPayload {
    /// Raw document text; it is parsed and saved as llms.txt.
    pub content: String,
    /// SHA-256 digest string reported alongside the fetched content.
    pub sha256: String,
    /// `ETag` reported for the content, if any.
    pub etag: Option<String>,
    /// `Last-Modified` value reported for the content, if any.
    pub last_modified: Option<String>,
}
138
/// URL and variant chosen for a refresh.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct RefreshUrlResolution {
    /// URL the refresh should fetch from.
    pub final_url: String,
    /// Variant served at `final_url`.
    pub variant: crate::SourceVariant,
    /// `true` when the stored URL was replaced by a better one that was
    /// discovered during resolution.
    pub upgraded: bool,
}
149
150pub async fn resolve_refresh_url(
152 fetcher: &Fetcher,
153 metadata: &Source,
154) -> Result<RefreshUrlResolution> {
155 if metadata.variant != crate::SourceVariant::Llms {
156 return Ok(RefreshUrlResolution {
157 final_url: metadata.url.clone(),
158 variant: metadata.variant.clone(),
159 upgraded: false,
160 });
161 }
162
163 match resolve_best_url(fetcher, &metadata.url).await {
164 Ok(resolved) if resolved.variant == crate::SourceVariant::LlmsFull => {
165 Ok(RefreshUrlResolution {
166 final_url: resolved.final_url,
167 variant: resolved.variant,
168 upgraded: true,
169 })
170 },
171 Ok(_) | Err(_) => Ok(RefreshUrlResolution {
172 final_url: metadata.url.clone(),
173 variant: metadata.variant.clone(),
174 upgraded: false,
175 }),
176 }
177}
178
179pub async fn refresh_source<S, I>(
181 storage: &S,
182 fetcher: &Fetcher,
183 alias: &str,
184 metrics: PerformanceMetrics,
185 indexer: &I,
186 filter_preference: bool,
187) -> Result<RefreshOutcome>
188where
189 S: RefreshStorage + Sync,
190 I: RefreshIndexer + Sync,
191{
192 let existing_metadata = storage.load_metadata(alias)?;
193 let existing_aliases = storage.load_llms_aliases(alias)?;
194 let resolution = resolve_refresh_url(fetcher, &existing_metadata).await?;
195
196 refresh_source_with_metadata(
197 storage,
198 fetcher,
199 alias,
200 existing_metadata,
201 existing_aliases,
202 &resolution,
203 metrics,
204 indexer,
205 filter_preference,
206 )
207 .await
208}
209
210#[allow(clippy::too_many_arguments)]
212pub async fn refresh_source_with_metadata<S, I>(
213 storage: &S,
214 fetcher: &Fetcher,
215 alias: &str,
216 existing_metadata: Source,
217 existing_aliases: Vec<String>,
218 resolution: &RefreshUrlResolution,
219 metrics: PerformanceMetrics,
220 indexer: &I,
221 filter_preference: bool,
222) -> Result<RefreshOutcome>
223where
224 S: RefreshStorage + Sync,
225 I: RefreshIndexer + Sync,
226{
227 let fetch_result = fetcher
228 .fetch_with_cache(
229 &resolution.final_url,
230 existing_metadata.etag.as_deref(),
231 existing_metadata.last_modified.as_deref(),
232 )
233 .await?;
234
235 match fetch_result {
236 FetchResult::NotModified { .. } => {
237 if existing_metadata.filter_non_english.unwrap_or(true) != filter_preference {
238 let mut updated_metadata = existing_metadata.clone();
239 updated_metadata.filter_non_english = Some(filter_preference);
240 storage.save_metadata(alias, &updated_metadata)?;
241 }
242 Ok(RefreshOutcome::Unchanged {
243 alias: alias.to_string(),
244 })
245 },
246 FetchResult::Modified {
247 content,
248 sha256,
249 etag,
250 last_modified,
251 } => {
252 let payload = RefreshPayload {
253 content,
254 sha256,
255 etag,
256 last_modified,
257 };
258
259 let mut updated_metadata = existing_metadata.clone();
260 updated_metadata.url.clone_from(&resolution.final_url);
261 updated_metadata.variant = resolution.variant.clone();
262 updated_metadata.filter_non_english = Some(filter_preference);
263
264 apply_refresh(
265 storage,
266 alias,
267 updated_metadata,
268 existing_aliases,
269 payload,
270 metrics,
271 indexer,
272 )
273 },
274 }
275}
276
277pub fn reindex_source<S, I>(
279 storage: &S,
280 alias: &str,
281 metrics: PerformanceMetrics,
282 indexer: &I,
283 filter_preference: bool,
284) -> Result<ReindexOutcome>
285where
286 S: RefreshStorage,
287 I: RefreshIndexer,
288{
289 let content = storage.load_llms_txt(alias)?;
290 let mut parser = MarkdownParser::new()?;
291 let mut parse_result = parser.parse(&content)?;
292
293 let before_count = parse_result.heading_blocks.len();
294 apply_language_filter(&mut parse_result, filter_preference);
295 let after_count = parse_result.heading_blocks.len();
296
297 let index_path = storage.index_path(alias)?;
298 indexer.index(
299 alias,
300 index_path.as_path(),
301 metrics,
302 &parse_result.heading_blocks,
303 )?;
304
305 Ok(ReindexOutcome {
306 alias: alias.to_string(),
307 headings_before: before_count,
308 headings_after: after_count,
309 filtered: before_count.saturating_sub(after_count),
310 })
311}
312
313#[allow(clippy::too_many_arguments)]
315#[allow(clippy::too_many_lines)]
316pub fn apply_refresh<S, I>(
317 storage: &S,
318 alias: &str,
319 existing_metadata: Source,
320 existing_aliases: Vec<String>,
321 payload: RefreshPayload,
322 metrics: PerformanceMetrics,
323 indexer: &I,
324) -> Result<RefreshOutcome>
325where
326 S: RefreshStorage,
327 I: RefreshIndexer,
328{
329 let mut parser = MarkdownParser::new()?;
330 let mut parse_result = parser.parse(&payload.content)?;
331
332 let filter_enabled = existing_metadata.filter_non_english.unwrap_or(true);
333 let filter_stats = Some(apply_language_filter(&mut parse_result, filter_enabled));
334
335 storage.save_llms_txt(alias, &payload.content)?;
336
337 let mut llms_json = build_llms_json(
338 alias,
339 &existing_metadata.url,
340 "llms.txt",
341 payload.sha256.clone(),
342 payload.etag.clone(),
343 payload.last_modified.clone(),
344 &parse_result,
345 );
346
347 let mut metadata_aliases = existing_aliases;
348 for alias_value in &existing_metadata.aliases {
349 if !metadata_aliases.contains(alias_value) {
350 metadata_aliases.push(alias_value.clone());
351 }
352 }
353 metadata_aliases.sort();
354 metadata_aliases.dedup();
355 llms_json.metadata.aliases = metadata_aliases;
356 llms_json.metadata.tags.clone_from(&existing_metadata.tags);
357 llms_json
358 .metadata
359 .description
360 .clone_from(&existing_metadata.description);
361 llms_json
362 .metadata
363 .category
364 .clone_from(&existing_metadata.category);
365 llms_json
366 .metadata
367 .npm_aliases
368 .clone_from(&existing_metadata.npm_aliases);
369 llms_json
370 .metadata
371 .github_aliases
372 .clone_from(&existing_metadata.github_aliases);
373 llms_json.metadata.variant = existing_metadata.variant.clone();
374 llms_json.filter_stats = filter_stats;
375
376 storage.save_llms_json(alias, &llms_json)?;
377
378 let mut origin = existing_metadata.origin.clone();
379 origin.source_type = match (&origin.source_type, &existing_metadata.origin.source_type) {
380 (Some(SourceType::Remote { .. }), _) | (None, None) => Some(SourceType::Remote {
381 url: existing_metadata.url.clone(),
382 }),
383 (Some(SourceType::LocalFile { path }), _) => {
384 Some(SourceType::LocalFile { path: path.clone() })
385 },
386 (None, Some(existing)) => Some(existing.clone()),
387 };
388
389 llms_json.metadata.origin = origin.clone();
390
391 let metadata = Source {
392 url: existing_metadata.url,
393 etag: payload.etag,
394 last_modified: payload.last_modified,
395 fetched_at: chrono::Utc::now(),
396 sha256: payload.sha256,
397 variant: existing_metadata.variant,
398 aliases: existing_metadata.aliases,
399 tags: existing_metadata.tags,
400 description: existing_metadata.description,
401 category: existing_metadata.category,
402 npm_aliases: existing_metadata.npm_aliases,
403 github_aliases: existing_metadata.github_aliases,
404 origin,
405 filter_non_english: existing_metadata.filter_non_english,
406 };
407 storage.save_metadata(alias, &metadata)?;
408
409 let index_path = storage.index_path(alias)?;
410 indexer.index(
411 alias,
412 index_path.as_path(),
413 metrics,
414 &parse_result.heading_blocks,
415 )?;
416
417 Ok(RefreshOutcome::Refreshed {
418 alias: alias.to_string(),
419 headings: count_headings(&llms_json.toc),
420 lines: llms_json.line_index.total_lines,
421 })
422}
423
424fn apply_language_filter(
425 parse_result: &mut ParseResult,
426 filter_enabled: bool,
427) -> HeadingFilterStats {
428 let original_count = parse_result.heading_blocks.len();
429 if filter_enabled {
430 let mut language_filter = LanguageFilter::new(true);
431 parse_result.heading_blocks.retain(|block| {
432 let urls_in_content = extract_urls_from_content(&block.content);
433 let url_check = urls_in_content.is_empty()
434 || urls_in_content
435 .iter()
436 .all(|url| language_filter.is_english_url(url));
437
438 let heading_check = language_filter.is_english_heading_path(&block.path);
439
440 url_check && heading_check
441 });
442 }
443
444 let accepted = parse_result.heading_blocks.len();
445 let filtered_count = original_count.saturating_sub(accepted);
446 HeadingFilterStats {
447 enabled: filter_enabled,
448 headings_total: original_count,
449 headings_accepted: accepted,
450 headings_rejected: filtered_count,
451 reason: if filter_enabled {
452 "non-English content removed".to_string()
453 } else {
454 "filtering disabled".to_string()
455 },
456 }
457}
458
459fn count_headings(entries: &[TocEntry]) -> usize {
460 entries
461 .iter()
462 .map(|entry| 1 + count_headings(&entry.children))
463 .sum()
464}
465
466fn extract_urls_from_content(content: &str) -> Vec<String> {
467 let mut urls = Vec::new();
468
469 let mut search_start = 0;
470 while let Some(rel) = content[search_start..].find('[') {
471 let open_idx = search_start + rel;
472 if let Some(close_rel) = content[open_idx + 1..].find(']') {
473 let close_idx = open_idx + 1 + close_rel;
474 let after_bracket = content.get(close_idx + 1..).unwrap_or("");
475 if let Some(rest) = after_bracket.strip_prefix('(') {
476 if let Some(paren_rel) = rest.find(')') {
477 if let Some(cleaned) = clean_url_slice(&rest[..paren_rel]) {
478 urls.push(cleaned.to_string());
479 }
480 }
481 }
482 }
483 search_start = open_idx + 1;
484 }
485
486 urls
487}
488
489fn clean_url_slice(s: &str) -> Option<&str> {
490 let trimmed = s.trim();
491 if trimmed.is_empty() {
492 return None;
493 }
494
495 let trimmed = trimmed
496 .strip_prefix('"')
497 .or_else(|| trimmed.strip_prefix('\''))
498 .unwrap_or(trimmed);
499
500 let trimmed = trimmed
501 .strip_suffix('"')
502 .or_else(|| trimmed.strip_suffix('\''))
503 .unwrap_or(trimmed);
504
505 let mut end = trimmed.len();
506 for (idx, ch) in trimmed.char_indices().rev() {
507 if trailing_punctuation(ch) {
508 end = idx;
509 } else {
510 break;
511 }
512 }
513
514 if end == 0 {
515 None
516 } else {
517 Some(&trimmed[..end])
518 }
519}
520
/// Whether `c` is one of the ASCII punctuation marks (`,` `.` `;` `:` `!`
/// `?`) that are stripped from the end of a link target.
const fn trailing_punctuation(c: char) -> bool {
    c == ',' || c == '.' || c == ';' || c == ':' || c == '!' || c == '?'
}
524
#[cfg(test)]
mod tests {
    use super::*;
    use std::cell::RefCell;
    use std::collections::HashMap;

    use anyhow::Result;

    /// In-memory [`RefreshStorage`] double that records which writes happened.
    #[derive(Default)]
    struct MockStorage {
        metadata: HashMap<String, Source>,
        saved_txt: RefCell<Vec<String>>,
        saved_json: RefCell<Vec<String>>,
        saved_metadata: RefCell<Vec<Source>>,
        index_paths: HashMap<String, PathBuf>,
        cached_txt: HashMap<String, String>,
    }

    impl RefreshStorage for MockStorage {
        fn load_metadata(&self, alias: &str) -> crate::Result<Source> {
            match self.metadata.get(alias) {
                Some(source) => Ok(source.clone()),
                None => Err(crate::Error::NotFound("missing metadata".to_string())),
            }
        }

        fn load_llms_aliases(&self, _alias: &str) -> crate::Result<Vec<String>> {
            Ok(Vec::new())
        }

        fn save_llms_txt(&self, alias: &str, _content: &str) -> crate::Result<()> {
            let mut recorded = self.saved_txt.borrow_mut();
            recorded.push(alias.to_string());
            Ok(())
        }

        fn save_llms_json(&self, alias: &str, _data: &crate::LlmsJson) -> crate::Result<()> {
            let mut recorded = self.saved_json.borrow_mut();
            recorded.push(alias.to_string());
            Ok(())
        }

        fn save_metadata(&self, _alias: &str, metadata: &Source) -> crate::Result<()> {
            let mut recorded = self.saved_metadata.borrow_mut();
            recorded.push(metadata.clone());
            Ok(())
        }

        fn index_path(&self, alias: &str) -> crate::Result<PathBuf> {
            match self.index_paths.get(alias) {
                Some(path) => Ok(path.clone()),
                None => Err(crate::Error::NotFound("missing index path".to_string())),
            }
        }

        fn load_llms_txt(&self, alias: &str) -> crate::Result<String> {
            match self.cached_txt.get(alias) {
                Some(text) => Ok(text.clone()),
                None => Err(crate::Error::NotFound(format!(
                    "missing llms.txt for {alias}"
                ))),
            }
        }
    }

    /// [`RefreshIndexer`] double that only records the aliases it was asked
    /// to index.
    #[derive(Default)]
    struct MockIndexer {
        indexed: RefCell<Vec<String>>,
    }

    impl RefreshIndexer for MockIndexer {
        fn index(
            &self,
            alias: &str,
            _index_path: &std::path::Path,
            _metrics: PerformanceMetrics,
            _blocks: &[crate::HeadingBlock],
        ) -> crate::Result<()> {
            let mut recorded = self.indexed.borrow_mut();
            recorded.push(alias.to_string());
            Ok(())
        }
    }

    /// Remote llms.txt source with filtering enabled and no cache validators.
    fn sample_source() -> Source {
        let url = "https://example.com/llms.txt".to_string();
        let origin = crate::SourceOrigin {
            manifest: None,
            source_type: Some(SourceType::Remote { url: url.clone() }),
        };

        Source {
            url,
            etag: None,
            last_modified: None,
            fetched_at: chrono::Utc::now(),
            sha256: "abc123".to_string(),
            variant: crate::SourceVariant::Llms,
            aliases: Vec::new(),
            tags: Vec::new(),
            description: None,
            category: None,
            npm_aliases: Vec::new(),
            github_aliases: Vec::new(),
            origin,
            filter_non_english: Some(true),
        }
    }

    /// Minimal one-heading document without cache validators.
    fn sample_payload() -> RefreshPayload {
        RefreshPayload {
            content: "# Title\n\nSome content.\n".to_string(),
            sha256: "abc123".to_string(),
            etag: None,
            last_modified: None,
        }
    }

    /// A successful refresh writes the text, the JSON and the metadata
    /// exactly once each and triggers a single indexing run.
    #[test]
    fn apply_refresh_persists_changes() -> Result<()> {
        let alias = "test";
        let storage = MockStorage {
            metadata: HashMap::from([(alias.to_string(), sample_source())]),
            index_paths: HashMap::from([(alias.to_string(), PathBuf::from("index"))]),
            ..MockStorage::default()
        };
        let indexer = MockIndexer::default();

        let outcome = apply_refresh(
            &storage,
            alias,
            sample_source(),
            Vec::new(),
            sample_payload(),
            PerformanceMetrics::default(),
            &indexer,
        )?;

        assert!(matches!(outcome, RefreshOutcome::Refreshed { .. }));
        assert_eq!(storage.saved_txt.borrow().len(), 1);
        assert_eq!(storage.saved_json.borrow().len(), 1);
        assert_eq!(storage.saved_metadata.borrow().len(), 1);
        assert_eq!(indexer.indexed.borrow().len(), 1);
        Ok(())
    }
}