1use anyhow::Result;
7use std::path::Path;
8use std::sync::Arc;
9
10use crate::semantic::config;
11use crate::semantic::providers::{self, LlmProvider};
12
13use super::llm_cache::LlmCache;
14
/// System prompt for changelog narration.
///
/// Instructs the model to group commits into product-level entries and to
/// answer with a JSON object of the form `{"entries": [{"title", "description"}]}`.
/// The text deliberately ends with a `COMMIT DATA:` header so the commit
/// context can be appended directly after it.
const CHANGELOG_SYSTEM_PROMPT: &str = "\
You are a technical writer creating a product-level changelog from recent development activity.
Your audience is developers and stakeholders who want to understand what changed, why, and what it impacts — NOT the raw commit details.

Guidelines:
- Group related commits into 3-8 high-level changelog entries.
- Each entry needs a clear title (what changed) and a 2-4 sentence description (why it matters, what it impacts).
- Include an approximate date or date range in parentheses after each entry's title, like \"Added search (Apr 10–12)\".
- Write at a product/feature level, not code level. Say \"Added search to documentation\" not \"Integrated pagefind library into site.rs\".
- Focus on user-visible impact and system-level consequences.
- Do NOT include commit hashes, file paths, or diff statistics in your output.
- Do NOT speculate beyond what the commit messages and file changes reveal.

Output VALID JSON:
{
 \"entries\": [
 {
 \"title\": \"Short descriptive title (Apr 10–12)\",
 \"description\": \"2-4 sentences explaining what changed, why, and what it impacts.\"
 }
 ]
}

COMMIT DATA:
";
41
/// System prompt for per-module wiki overviews.
///
/// Restricts the model to facts in the supplied context and asks for a
/// 4-8 sentence description of the module's purpose and architectural role.
/// The text ends with a `STRUCTURAL CONTEXT:` header so the module context
/// can be appended directly after it.
const WIKI_SYSTEM_PROMPT: &str = "\
You are a technical writer creating a module overview for a codebase wiki.
You may ONLY describe facts present in the STRUCTURAL CONTEXT below.

CRITICAL RULES:
- NEVER start with 'The X module consists of...', 'This module contains...', or any variant.
- Your first sentence MUST state what the module DOES or what PURPOSE it serves — infer this from file names, symbol names, and its dependency position.
- Focus on PURPOSE, RESPONSIBILITIES, and ARCHITECTURAL ROLE — not on listing individual files or classes.
- Describe the module's architectural role: Is it a hub (many dependents)? A leaf (few dependents)? A bridge between subsystems?
- Explain how this module fits into the larger system — what it provides to modules that depend on it, and what it consumes from its own dependencies.
- If the module has high fan-in (many dependents), note that changes to it have wide blast radius.
- If the module has significantly more or fewer files/lines than average for the codebase, note that.
- Note complexity: file count, line count, symbol density.
- Do NOT enumerate specific file names, class names, or function names unless they represent a truly central abstraction that defines the module's identity (e.g., a primary entry point or the single core type). When in doubt, describe WHAT it does rather than naming the file that does it.
- Vary your sentence structure. Do NOT repeat patterns across modules.
- Write 4-8 sentences. Be specific about what the module does and its scale, not about which files it contains.
- Do NOT speculate about design intent or add information not in the context.
- NEVER leave missing spaces between words. Proofread your output.

STRUCTURAL CONTEXT:
";
64
/// System prompt for the project-level overview page.
///
/// Asks for 3-4 paragraphs covering purpose, architecture, and scale, using
/// only facts from the supplied context. The text ends with a
/// `STRUCTURAL CONTEXT:` header so the project context can be appended
/// directly after it.
const PROJECT_OVERVIEW_SYSTEM_PROMPT: &str = "\
You are a technical writer creating a project overview for auto-generated codebase documentation.
You may ONLY describe facts present in the STRUCTURAL CONTEXT below.

CRITICAL RULES:
- NEVER start with 'This project consists of...' or 'The codebase is...'
- Your first sentence MUST describe what this software DOES — its purpose and primary function. Use evidence from module names and symbol names to infer the specific domain (e.g., 'code search' from TrigramIndex, QueryEngine, ParserFactory).
- Paragraph 1: What it does and how (infer from module names, key symbols, languages used).
- Paragraph 2: Architecture — how the major modules relate. Which modules are central hubs? What are the natural boundaries? Describe the data flow direction — which modules produce data and which consume it.
- Paragraph 3: Scale and notable patterns — file/line counts, language mix, dependency health (cycles, hotspots).
- Write exactly 3-4 paragraphs. Be specific: use module names, file counts, and dependency numbers.
- Do NOT speculate or add information not in the context.
- NEVER leave missing spaces between words. Proofread your output.

STRUCTURAL CONTEXT:
";
82
/// System prompt for the architecture narrative derived from the dependency graph.
///
/// Asks for 3-5 paragraphs on central modules, data flow, subsystem
/// boundaries, and concerning patterns, using only facts from the supplied
/// context. The text ends with a `STRUCTURAL CONTEXT:` header so the graph
/// context can be appended directly after it.
const ARCHITECTURE_NARRATIVE_SYSTEM_PROMPT: &str = "\
You are a technical writer narrating the architecture of a codebase based on its dependency graph.
You may ONLY describe facts present in the STRUCTURAL CONTEXT below.

CRITICAL RULES:
- NEVER start with 'The architecture consists of...' or 'This codebase is organized...'
- Lead with the most connected module and explain WHY it's central (what it provides to others).
- Describe data flow: which modules are producers (depended-on) vs consumers (depend on many).
- Identify if the codebase follows a layered pattern (e.g., parsers → models → query engine → CLI) and describe the information flow between layers.
- Identify natural boundaries: groups of tightly-coupled modules that form subsystems.
- Call out concerning patterns: circular dependencies, extreme fan-in hotspots, isolated modules.
- Note peripheral modules: what sits at the edges and what role they serve.
- Write 3-5 paragraphs. Every claim must reference specific module names and dependency counts.
- Do NOT speculate about design intent or add information not in the context.
- NEVER leave missing spaces between words. Proofread your output.

STRUCTURAL CONTEXT:
";
102
/// System prompt for the "Getting Started" onboarding guide.
///
/// Asks for 4-5 plain-language paragraphs (purpose, organization, where to
/// start reading, key conventions), using only facts from the supplied
/// context. The text ends with a `STRUCTURAL CONTEXT:` header so the
/// context can be appended directly after it.
const ONBOARD_SYSTEM_PROMPT: &str = "\
You are a technical writer creating a \"Getting Started\" guide for a developer's first day on this codebase.
You may ONLY describe facts present in the STRUCTURAL CONTEXT below.

CRITICAL RULES:
- Write 4-5 paragraphs in plain language that a new team member could follow.
- Paragraph 1: What this project does — its purpose and primary function, in one or two sentences a non-developer could understand.
- Paragraph 2: How the code is organized — the major directories/modules, what each is responsible for.
- Paragraph 3: Where to start reading — which entry points to look at first, and why.
- Paragraph 4: Key patterns and conventions — recurring design patterns, naming conventions, or architectural idioms a newcomer should know.
- Use specific file and module names from the context.
- Do NOT speculate or add information not in the context.
- NEVER leave missing spaces between words. Proofread your output.

STRUCTURAL CONTEXT:
";
120
/// System prompt for the recent-activity timeline summary.
///
/// Asks for 3-5 concise paragraphs on active areas, stable vs evolving
/// modules, high-churn files, and contributor patterns, using only facts
/// from the supplied context. The text ends with a `STRUCTURAL CONTEXT:`
/// header so the activity context can be appended directly after it.
const TIMELINE_SYSTEM_PROMPT: &str = "\
You are a technical writer summarizing recent development activity for a codebase.
You may ONLY describe facts present in the STRUCTURAL CONTEXT below.

CRITICAL RULES:
- Lead with the most active area of the codebase and explain what's happening there.
- Identify stable modules (few recent changes) vs evolving modules (many recent changes).
- Flag high-churn files that may warrant attention — files changing very frequently could indicate active development or instability.
- Note contributor patterns — is this a solo project or a team effort? Who owns which areas?
- Write 3-5 concise paragraphs with specific numbers, file names, and module names.
- Do NOT speculate about intent or add information not in the context.
- NEVER leave missing spaces between words. Proofread your output.

STRUCTURAL CONTEXT:
";
137
/// System prompt for the product-concepts glossary page.
///
/// Asks the model for 10-15 high-level concepts grouped into categories and
/// to answer with a JSON object containing `intro` and a `concepts` array
/// (`name`, `category`, `definition`, `related_modules`). The text ends with
/// a `STRUCTURAL EVIDENCE:` header so the evidence can be appended directly
/// after it.
const CONCEPTS_SYSTEM_PROMPT: &str = "\
You are documenting a software product's core vocabulary for a non-technical reader.

From the structural evidence below, identify 10-15 HIGH-LEVEL product concepts that someone needs to understand to know what this product DOES and how it works. Concepts are NOUN PHRASES describing capabilities, data ideas, or workflows — NOT specific class names, function names, or file names.

GOOD concept examples: 'Trigram Index', 'Symbol Cache', 'AST Query', 'Dependency Graph', 'LLM Narration', 'Runtime Symbol Detection'
BAD concept examples: 'SearchResult struct', 'QueryEngine class', 'extract_symbols function'

Rules:
- Each definition must be 1-3 sentences in plain language a product person could understand.
- Do NOT start definitions with 'This is a...', 'Represents a...', 'A struct that...'
- Group concepts into 2-4 categories of your choice (e.g. 'Core Capabilities', 'Data Model', 'Workflows', 'Developer Tools').
- Anchor each concept to 1-3 module paths from the evidence — these become wiki links.
- Write exactly ONE intro paragraph (2-3 sentences) describing what kind of vocabulary this page catalogs for this specific product.

Output VALID JSON MATCHING THIS SCHEMA EXACTLY — no markdown fences, no commentary before or after:
{
 \"intro\": \"...\",
 \"concepts\": [
 {
 \"name\": \"Concept Name\",
 \"category\": \"Category Name\",
 \"definition\": \"1-3 sentence plain-language definition.\",
 \"related_modules\": [\"src/foo\", \"src/bar\"]
 }
 ]
}

STRUCTURAL EVIDENCE:
";
174
/// Minimum number of whitespace-separated words a structural context must
/// contain to be narrated; shorter contexts are skipped without an LLM call.
const MIN_CONTENT_WORDS: usize = 15;
178
179pub fn create_pulse_provider() -> Result<Box<dyn LlmProvider>> {
185 let semantic_config = config::load_config(Path::new("."))?;
186
187 let (provider, api_key) = match config::get_api_key(&semantic_config.provider) {
189 Ok(key) => (semantic_config.provider.clone(), key),
190 Err(configured_err) => {
191 let fallbacks: &[&str] = &["openrouter", "anthropic", "openai"];
193 let mut found = None;
194 for &candidate in fallbacks {
195 if candidate == semantic_config.provider {
196 continue;
197 }
198 if let Ok(key) = config::get_api_key(candidate) {
199 eprintln!(
200 "Note: no API key for configured provider '{}', using auto-detected '{}'",
201 semantic_config.provider, candidate
202 );
203 found = Some((candidate.to_string(), key));
204 break;
205 }
206 }
207 found.ok_or(configured_err)?
208 }
209 };
210
211 let model = config::resolve_model_for(&provider, semantic_config.model.as_deref(), None);
212
213 let options = config::get_provider_options(&provider);
214
215 providers::create_provider(&provider, api_key, model, options)
216}
217
218pub fn narrate_section(
227 provider: &dyn LlmProvider,
228 system_prompt: &str,
229 structural_context: &str,
230 cache: &LlmCache,
231 snapshot_id: &str,
232 cache_key_suffix: &str,
233) -> Option<String> {
234 let word_count = structural_context.split_whitespace().count();
236 if word_count < MIN_CONTENT_WORDS {
237 eprintln!(" Skipping: {} (too brief, {} words)", cache_key_suffix, word_count);
238 return None;
239 }
240
241 let cache_key = LlmCache::compute_key(snapshot_id, cache_key_suffix, structural_context);
243 match cache.get(&cache_key) {
244 Ok(Some(cached)) => {
245 log::debug!("LLM cache hit for '{}'", cache_key_suffix);
246 eprintln!(" Narrating: {} (cached)", cache_key_suffix);
247 return Some(cached.response);
248 }
249 Ok(None) => {}
250 Err(e) => {
251 log::warn!("Failed to read LLM cache: {}", e);
252 }
253 }
254
255 let prompt = format!("{}{}", system_prompt, structural_context);
257
258 eprintln!(" Narrating: {}...", cache_key_suffix);
259
260 let result = call_llm_sync(provider, &prompt);
262
263 match result {
264 Ok(response) => {
265 let response = postprocess_narration(&response);
266
267 let context_hash = blake3::hash(structural_context.as_bytes()).to_hex().to_string();
269 if let Err(e) = cache.put(&cache_key, &context_hash, &response) {
270 log::warn!("Failed to write LLM cache: {}", e);
271 }
272
273 Some(response)
274 }
275 Err(e) => {
276 log::warn!("LLM narration failed for '{}': {}", cache_key_suffix, e);
277 None
278 }
279 }
280}
281
/// One unit of work for [`narrate_batch`].
#[derive(Debug, Clone)]
pub struct NarrationTask {
    /// Instruction text placed immediately before the context in the prompt.
    pub system_prompt: &'static str,
    /// The facts to narrate; also hashed into the cache key.
    pub structural_context: String,
    /// Snapshot identifier used when computing the cache key.
    pub snapshot_id: String,
    /// Identifies the section: part of the cache key and the label shown in
    /// progress output.
    pub cache_key_suffix: String,
}
289
/// Outcome of one [`NarrationTask`] processed by [`narrate_batch`].
#[derive(Debug, Clone)]
pub struct NarrationResult {
    /// The `cache_key_suffix` of the task this result belongs to.
    pub cache_key_suffix: String,
    /// The narration text, or `None` if the task was skipped or failed.
    pub response: Option<String>,
}
295
296pub fn narrate_batch(
301 provider: Arc<dyn LlmProvider>,
302 tasks: Vec<NarrationTask>,
303 cache: &LlmCache,
304 concurrency: usize,
305) -> Vec<NarrationResult> {
306 let total = tasks.len();
307 if total == 0 {
308 return Vec::new();
309 }
310
311 let mut results: Vec<NarrationResult> = Vec::with_capacity(total);
313 let mut pending: Vec<(usize, NarrationTask, String)> = Vec::new(); for task in tasks {
316 let word_count = task.structural_context.split_whitespace().count();
317 if word_count < MIN_CONTENT_WORDS {
318 eprintln!(" Skipping: {} (too brief, {} words)", task.cache_key_suffix, word_count);
319 results.push(NarrationResult {
320 cache_key_suffix: task.cache_key_suffix,
321 response: None,
322 });
323 continue;
324 }
325
326 let cache_key = LlmCache::compute_key(
327 &task.snapshot_id,
328 &task.cache_key_suffix,
329 &task.structural_context,
330 );
331 match cache.get(&cache_key) {
332 Ok(Some(cached)) => {
333 eprintln!(" Narrating: {} (cached)", task.cache_key_suffix);
334 results.push(NarrationResult {
335 cache_key_suffix: task.cache_key_suffix,
336 response: Some(cached.response),
337 });
338 }
339 _ => {
340 let idx = results.len();
341 results.push(NarrationResult {
342 cache_key_suffix: task.cache_key_suffix.clone(),
343 response: None,
344 });
345 pending.push((idx, task, cache_key));
346 }
347 }
348 }
349
350 if pending.is_empty() {
351 return results;
352 }
353
354 let pending_count = pending.len();
355 let effective_concurrency = if concurrency == 0 { pending_count } else { concurrency };
356 eprintln!(
357 " Dispatching {} LLM calls ({} concurrent)...",
358 pending_count, effective_concurrency
359 );
360
361 let cache_dir = cache.cache_dir().to_path_buf();
363
364 let rt = match tokio::runtime::Runtime::new() {
366 Ok(rt) => rt,
367 Err(e) => {
368 log::warn!("Failed to create tokio runtime for batch narration: {}", e);
369 return results;
370 }
371 };
372
373 let async_results = rt.block_on(async {
374 let semaphore = Arc::new(tokio::sync::Semaphore::new(effective_concurrency));
375 let mut join_set = tokio::task::JoinSet::new();
376
377 for (idx, task, cache_key) in pending {
378 let provider = Arc::clone(&provider);
379 let sem = Arc::clone(&semaphore);
380 let cache_dir = cache_dir.clone();
381
382 join_set.spawn(async move {
383 let _permit = sem.acquire().await.expect("semaphore closed");
384 let start = std::time::Instant::now();
385 eprintln!(" Narrating: {}...", task.cache_key_suffix);
386
387 let prompt = format!("{}{}", task.system_prompt, task.structural_context);
388 let result = call_llm_async(&*provider, &prompt).await;
389
390 let response = match result {
391 Ok(raw) => {
392 let response = postprocess_narration(&raw);
393
394 let task_cache = LlmCache::from_dir(cache_dir);
396 let context_hash = blake3::hash(task.structural_context.as_bytes())
397 .to_hex()
398 .to_string();
399 if let Err(e) = task_cache.put(&cache_key, &context_hash, &response) {
400 log::warn!("Failed to write LLM cache for '{}': {}", task.cache_key_suffix, e);
401 }
402
403 eprintln!(
404 " Narrating: {} (done, {:.1}s)",
405 task.cache_key_suffix,
406 start.elapsed().as_secs_f64()
407 );
408 Some(response)
409 }
410 Err(e) => {
411 log::warn!("LLM narration failed for '{}': {}", task.cache_key_suffix, e);
412 eprintln!(
413 " Narrating: {} (failed, {:.1}s)",
414 task.cache_key_suffix,
415 start.elapsed().as_secs_f64()
416 );
417 None
418 }
419 };
420
421 (idx, task.cache_key_suffix, response)
422 });
423 }
424
425 let mut async_results = Vec::new();
426 while let Some(result) = join_set.join_next().await {
427 match result {
428 Ok(r) => async_results.push(r),
429 Err(e) => log::warn!("Narration task panicked: {}", e),
430 }
431 }
432 async_results
433 });
434
435 for (idx, cache_key_suffix, response) in async_results {
437 results[idx] = NarrationResult {
438 cache_key_suffix,
439 response,
440 };
441 }
442
443 results
444}
445
446async fn call_llm_async(provider: &dyn LlmProvider, prompt: &str) -> Result<String> {
448 let max_retries = 2;
449 let mut last_error = None;
450
451 for attempt in 0..=max_retries {
452 if attempt > 0 {
453 log::debug!("Retrying LLM narration (attempt {}/{})", attempt + 1, max_retries + 1);
454 tokio::time::sleep(tokio::time::Duration::from_millis(500 * attempt as u64)).await;
455 }
456
457 match provider.complete(prompt, false).await {
458 Ok(response) => return Ok(response),
459 Err(e) => {
460 log::debug!("LLM call attempt {} failed: {}", attempt + 1, e);
461 last_error = Some(e);
462 }
463 }
464 }
465
466 Err(last_error.unwrap_or_else(|| anyhow::anyhow!("LLM call failed")))
467}
468
469pub fn changelog_system_prompt() -> &'static str {
471 CHANGELOG_SYSTEM_PROMPT
472}
473
474pub fn wiki_system_prompt() -> &'static str {
476 WIKI_SYSTEM_PROMPT
477}
478
479pub fn project_overview_system_prompt() -> &'static str {
481 PROJECT_OVERVIEW_SYSTEM_PROMPT
482}
483
484pub fn architecture_narrative_system_prompt() -> &'static str {
486 ARCHITECTURE_NARRATIVE_SYSTEM_PROMPT
487}
488
489pub fn onboard_system_prompt() -> &'static str {
491 ONBOARD_SYSTEM_PROMPT
492}
493
494pub fn timeline_system_prompt() -> &'static str {
496 TIMELINE_SYSTEM_PROMPT
497}
498
499pub fn concepts_system_prompt() -> &'static str {
501 CONCEPTS_SYSTEM_PROMPT
502}
503
/// Mixed-case proper nouns that must survive narration post-processing intact.
///
/// `postprocess_narration` swaps each of these for a placeholder before it
/// splits run-on words, then restores them afterwards.
const CAMEL_CASE_BLOCKLIST: &[&str] = &[
    // Languages
    "TypeScript",
    "JavaScript",
    "CoffeeScript",
    "ActionScript",
    // Databases and query languages
    "PostgreSQL",
    "MySQL",
    "MariaDB",
    "MongoDB",
    "CouchDB",
    "GraphQL",
    // Hosting and publishing
    "GitHub",
    "GitLab",
    "BitBucket",
    "WordPress",
    "PostCSS",
    // Tooling and platform terms
    "IntelliJ",
    "WebSocket",
    "WebAssembly",
    "DevOps",
    "DevTools",
    // Data / ML libraries
    "DataFrame",
    "NumPy",
    "PyTorch",
    "TensorFlow",
    "FastAPI",
    // JavaScript frameworks
    "NextJS",
    "NestJS",
    "NodeJS",
    "ExpressJS",
    "AngularJS",
    // Devices and operating systems
    "iPhone",
    "iPad",
    "macOS",
    "iOS",
    "FreeBSD",
    "OpenBSD",
    // Editors, playgrounds, and Apple frameworks
    "CodePen",
    "CodeSandbox",
    "JetBrains",
    "PhpStorm",
    "AppKit",
    "SwiftUI",
    "UIKit",
    "CoreData",
    "MapReduce",
    // Cloud services and platforms
    "CloudFormation",
    "CloudFront",
    "CloudWatch",
    "RedHat",
    "OpenShift",
    "OpenStack",
    // Editor features
    "SourceMap",
    "AutoComplete",
    "IntelliSense",
];
520
521fn postprocess_narration(text: &str) -> String {
523 let mut result = text.trim().to_string();
524
525 let re = regex::Regex::new(r"([a-z])\.([A-Z])").unwrap();
528 result = re.replace_all(&result, "$1. $2").to_string();
529
530 let mut placeholders: Vec<(&str, String)> = Vec::new();
533 for (i, term) in CAMEL_CASE_BLOCKLIST.iter().enumerate() {
534 if result.contains(*term) {
535 let placeholder = format!("\x00KEEP{}\x00", i);
536 result = result.replace(*term, &placeholder);
537 placeholders.push((term, placeholder));
538 }
539 }
540
541 let re = regex::Regex::new(r"([a-z]{3,})([A-Z][a-z]{2,})").unwrap();
543 let parts: Vec<&str> = result.split('`').collect();
544 let mut assembled = String::new();
545 for (i, part) in parts.iter().enumerate() {
546 if i % 2 == 0 {
547 assembled.push_str(&re.replace_all(part, "$1 $2"));
549 } else {
550 assembled.push('`');
552 assembled.push_str(part);
553 assembled.push('`');
554 }
555 }
556 result = assembled;
557
558 for (term, placeholder) in &placeholders {
560 result = result.replace(placeholder, term);
561 }
562
563 while result.contains(" ") {
565 result = result.replace(" ", " ");
566 }
567
568 result
569}
570
571fn call_llm_sync(provider: &dyn LlmProvider, prompt: &str) -> Result<String> {
574 let rt = tokio::runtime::Runtime::new()?;
575 rt.block_on(async {
576 let mut last_error = None;
577 let max_retries = 2;
578
579 for attempt in 0..=max_retries {
580 if attempt > 0 {
581 log::debug!("Retrying LLM narration (attempt {}/{})", attempt + 1, max_retries + 1);
582 tokio::time::sleep(tokio::time::Duration::from_millis(500 * attempt as u64)).await;
583 }
584
585 match provider.complete(prompt, false).await {
586 Ok(response) => return Ok(response),
587 Err(e) => {
588 log::debug!("LLM call attempt {} failed: {}", attempt + 1, e);
589 last_error = Some(e);
590 }
591 }
592 }
593
594 Err(last_error.unwrap_or_else(|| anyhow::anyhow!("LLM call failed")))
595 })
596}
597
#[cfg(test)]
mod tests {
    use super::*;

    /// Counts whitespace-separated words the same way the narration gate does.
    fn word_count(text: &str) -> usize {
        text.split_whitespace().count()
    }

    #[test]
    fn test_word_count_sufficient() {
        let text = "src/parsers/rust.rs has 250 lines and contains extract_symbols fn_name and other important functions used for parsing code";
        let count = word_count(text);
        assert!(count >= MIN_CONTENT_WORDS, "Word count {} should be >= {}", count, MIN_CONTENT_WORDS);
    }

    #[test]
    fn test_word_count_too_brief() {
        let text = "No data available yet.";
        let count = word_count(text);
        assert!(count < MIN_CONTENT_WORDS, "Word count {} should be < {}", count, MIN_CONTENT_WORDS);
    }

    #[test]
    fn test_word_count_empty() {
        let count = word_count("");
        assert!(count < MIN_CONTENT_WORDS);
    }

    #[test]
    fn test_word_count_wiki_structural() {
        let text = "| Language | Files | Lines |\n| --- | --- | --- |\n| Rust | 45 | 12,500 |\n\n**Files:** src/main.rs src/lib.rs src/query/mod.rs src/parsers/rust.rs";
        let count = word_count(text);
        assert!(count >= MIN_CONTENT_WORDS, "Wiki structural word count {} should be >= {}", count, MIN_CONTENT_WORDS);
    }

    #[test]
    fn test_word_count_digest_bootstrap() {
        let text = "Branch: feature/pulse Commit: abc1234 Files: 120 Edges: 340 Modules: src tests build.rs config.toml main.rs lib.rs";
        let count = word_count(text);
        assert!(count >= MIN_CONTENT_WORDS, "Digest bootstrap word count {} should be >= {}", count, MIN_CONTENT_WORDS);
    }

    #[test]
    fn test_changelog_system_prompt() {
        assert!(changelog_system_prompt().contains("COMMIT DATA"));
    }

    #[test]
    fn test_wiki_system_prompt() {
        assert!(wiki_system_prompt().contains("STRUCTURAL CONTEXT"));
    }

    #[test]
    fn test_postprocess_preserves_proper_nouns() {
        let input = "The TypeScript module handles JavaScript compilation.";
        let result = postprocess_narration(input);
        assert!(result.contains("TypeScript"), "Should preserve TypeScript, got: {}", result);
        assert!(result.contains("JavaScript"), "Should preserve JavaScript, got: {}", result);
    }

    #[test]
    fn test_postprocess_splits_run_on_words() {
        let input = "The parseModule drives the query engine.";
        let result = postprocess_narration(input);
        assert!(result.contains("parse Module"), "Should split run-on camelCase: {}", result);
    }

    #[test]
    fn test_postprocess_preserves_backtick_code() {
        let input = "Uses `TypeScript` and `parseModule` for processing.";
        let result = postprocess_narration(input);
        assert!(result.contains("`TypeScript`"), "Should preserve code: {}", result);
        assert!(result.contains("`parseModule`"), "Should preserve code: {}", result);
    }

    #[test]
    fn test_postprocess_fixes_missing_sentence_space() {
        let input = "First sentence.Second sentence starts here.";
        let result = postprocess_narration(input);
        assert!(result.contains(". S"), "Should add space after period: {}", result);
    }

    #[test]
    fn test_postprocess_fixes_double_spaces() {
        // The input must actually contain doubled spaces, and the check must
        // look for a doubled space; single spaces between words are expected.
        let input = "Too  many   spaces    here.";
        let result = postprocess_narration(input);
        assert!(!result.contains("  "), "Should remove double spaces: {}", result);
    }
}