batuta/oracle/coursera/
reflection.rs

1//! Reflection reading generation
2//!
3//! Extracts themes from transcript via keyword frequency + n-gram analysis,
4//! generates Bloom's taxonomy questions, and attaches arXiv citations.
5
6use std::collections::HashMap;
7
8use super::arxiv_db::ArxivDatabase;
9use super::types::{
10    ArxivCitation, BloomLevel, ReflectionQuestion, ReflectionReading, TranscriptInput,
11};
12
13/// Generate a reflection reading from a transcript.
14///
15/// If `topic_override` is provided, it's used for arXiv lookup instead of
16/// auto-detected themes.
17pub fn generate_reflection(
18    transcript: &TranscriptInput,
19    topic_override: Option<&str>,
20) -> ReflectionReading {
21    let themes = extract_themes(&transcript.text);
22    let questions = generate_bloom_questions(&themes, &transcript.text);
23
24    let db = ArxivDatabase::builtin();
25    let citations = if let Some(topic) = topic_override {
26        find_citations_for_topic(&db, topic)
27    } else {
28        find_citations_for_themes(&db, &themes)
29    };
30
31    ReflectionReading { themes, questions, citations }
32}
33
34/// Render a reflection reading as Markdown.
35pub fn render_reflection_markdown(reading: &ReflectionReading) -> String {
36    let mut md = String::new();
37    md.push_str("# Reflection Reading\n\n");
38
39    // Themes
40    md.push_str("## Key Themes\n\n");
41    if reading.themes.is_empty() {
42        md.push_str("No dominant themes extracted.\n\n");
43    } else {
44        for theme in &reading.themes {
45            md.push_str(&format!("- {theme}\n"));
46        }
47        md.push('\n');
48    }
49
50    // Questions
51    md.push_str("## Reflection Questions\n\n");
52    if reading.questions.is_empty() {
53        md.push_str("No reflection questions generated.\n\n");
54    } else {
55        for (i, q) in reading.questions.iter().enumerate() {
56            md.push_str(&format!("{}. **[{}]** {}\n\n", i + 1, q.thinking_level, q.question));
57        }
58    }
59
60    // Citations
61    md.push_str("## Further Reading\n\n");
62    if reading.citations.is_empty() {
63        md.push_str("No matching citations found.\n");
64    } else {
65        for cite in &reading.citations {
66            md.push_str(&format!(
67                "- {} ({}) — [{}]({}) — *{}*\n",
68                cite.authors, cite.year, cite.title, cite.url, cite.abstract_snippet,
69            ));
70        }
71    }
72    md.push('\n');
73
74    md
75}
76
77// ============================================================================
78// Internal
79// ============================================================================
80
81fn extract_themes(text: &str) -> Vec<String> {
82    let words: Vec<&str> = text
83        .split_whitespace()
84        .map(|w| w.trim_matches(|c: char| !c.is_alphanumeric()))
85        .filter(|w| w.len() >= 3)
86        .collect();
87
88    // Unigram frequency (excluding stop words)
89    let mut freq: HashMap<String, usize> = HashMap::new();
90    for word in &words {
91        let lower = word.to_lowercase();
92        if !is_stop_word(&lower) && lower.len() >= 3 {
93            *freq.entry(lower).or_default() += 1;
94        }
95    }
96
97    // Bigram frequency
98    let mut bigram_freq: HashMap<String, usize> = HashMap::new();
99    for pair in words.windows(2) {
100        let a = pair[0].to_lowercase();
101        let b = pair[1].to_lowercase();
102        if !is_stop_word(&a) && !is_stop_word(&b) && a.len() >= 3 && b.len() >= 3 {
103            let bigram = format!("{a} {b}");
104            *bigram_freq.entry(bigram).or_default() += 1;
105        }
106    }
107
108    // Merge: bigrams that appear >= 2 times, then top unigrams
109    let mut themes: Vec<(String, usize)> =
110        bigram_freq.into_iter().filter(|(_, count)| *count >= 2).collect();
111
112    let top_unigrams: Vec<(String, usize)> = {
113        let mut v: Vec<_> = freq.into_iter().filter(|(_, c)| *c >= 3).collect();
114        v.sort_by(|a, b| b.1.cmp(&a.1));
115        v.into_iter().take(10).collect()
116    };
117
118    themes.extend(top_unigrams);
119    themes.sort_by(|a, b| b.1.cmp(&a.1));
120    themes.dedup_by(|a, b| a.0 == b.0);
121
122    themes.into_iter().take(5).map(|(t, _)| capitalize_theme(&t)).collect()
123}
124
/// Upper-case the first character of `s`, leaving the remainder untouched.
///
/// An empty input yields an empty string. The upper-cased form of the first
/// character may be more than one character long (e.g. `ß` becomes `SS`).
fn capitalize_theme(s: &str) -> String {
    let mut rest = s.chars();
    rest.next().map_or_else(String::new, |first| {
        let mut out = String::with_capacity(s.len());
        out.extend(first.to_uppercase());
        out.push_str(rest.as_str());
        out
    })
}
132
133fn generate_bloom_questions(themes: &[String], text: &str) -> Vec<ReflectionQuestion> {
134    if themes.is_empty() {
135        return Vec::new();
136    }
137
138    let text_lower = text.to_lowercase();
139
140    let bloom_templates: Vec<(BloomLevel, &[&str])> = vec![
141        (
142            BloomLevel::Analysis,
143            &[
144                "What are the key components of {theme}, and how do they relate to each other?",
145                "Compare and contrast the approaches to {theme} discussed in the lecture.",
146                "What assumptions underlie the discussion of {theme}?",
147            ],
148        ),
149        (
150            BloomLevel::Synthesis,
151            &[
152                "How would you combine the concepts from {theme} with your existing knowledge to solve a novel problem?",
153                "Design a system that integrates {theme} with a complementary technique.",
154            ],
155        ),
156        (
157            BloomLevel::Evaluation,
158            &[
159                "What are the strengths and limitations of the approach to {theme} presented here?",
160                "Under what conditions would {theme} fail or underperform?",
161            ],
162        ),
163        (
164            BloomLevel::Application,
165            &[
166                "How would you apply {theme} in a production environment?",
167                "Describe a real-world scenario where {theme} would provide significant value.",
168            ],
169        ),
170        (
171            BloomLevel::Creation,
172            &[
173                "Propose an improvement or extension to {theme} that addresses a current limitation.",
174                "Design an experiment to validate the effectiveness of {theme} in your domain.",
175            ],
176        ),
177    ];
178
179    let mut questions = Vec::new();
180
181    for (level, templates) in &bloom_templates {
182        // Pick the best template for each level based on content relevance
183        let theme = select_theme_for_level(themes, &text_lower, level);
184        let template_idx = match level {
185            BloomLevel::Analysis => {
186                usize::from(text_lower.contains("compare") || text_lower.contains("contrast"))
187            }
188            BloomLevel::Evaluation => {
189                usize::from(text_lower.contains("limitation") || text_lower.contains("trade"))
190            }
191            _ => 0,
192        };
193
194        let template = templates[template_idx.min(templates.len() - 1)];
195        let question = template.replace("{theme}", &theme);
196
197        questions.push(ReflectionQuestion { question, thinking_level: *level });
198    }
199
200    questions
201}
202
203fn select_theme_for_level(themes: &[String], _text: &str, _level: &BloomLevel) -> String {
204    // Use the most prominent theme for most levels, rotate for variety
205    let idx = match _level {
206        BloomLevel::Analysis => 0,
207        BloomLevel::Synthesis => themes.len().min(1),
208        BloomLevel::Evaluation => themes.len().min(2) % themes.len(),
209        BloomLevel::Application => 0,
210        BloomLevel::Creation => themes.len().min(1) % themes.len(),
211    };
212    themes.get(idx).cloned().unwrap_or_else(|| "the topic".to_string())
213}
214
215fn find_citations_for_topic(db: &ArxivDatabase, topic: &str) -> Vec<ArxivCitation> {
216    let mut results = db.find_by_topic(topic, 5);
217    if results.len() < 3 {
218        // Try splitting topic into keywords
219        let keywords: Vec<&str> = topic.split_whitespace().collect();
220        let additional = db.find_by_keywords(&keywords, 5 - results.len());
221        for cite in additional {
222            if !results.iter().any(|r| r.arxiv_id == cite.arxiv_id) {
223                results.push(cite);
224            }
225        }
226    }
227    results.truncate(5);
228    results
229}
230
231fn find_citations_for_themes(db: &ArxivDatabase, themes: &[String]) -> Vec<ArxivCitation> {
232    let keywords: Vec<&str> = themes.iter().map(|t| t.as_str()).collect();
233    let mut results = db.find_by_keywords(&keywords, 5);
234
235    // If not enough, try individual themes
236    if results.len() < 3 {
237        for theme in themes {
238            let additional = db.find_by_topic(theme, 2);
239            for cite in additional {
240                if !results.iter().any(|r| r.arxiv_id == cite.arxiv_id) {
241                    results.push(cite);
242                }
243            }
244            if results.len() >= 5 {
245                break;
246            }
247        }
248    }
249
250    results.truncate(5);
251    results
252}
253
/// Report whether `word` is one of the built-in English stop words.
///
/// The comparison is exact and case-sensitive; every entry is lower-case, so
/// callers are expected to pass an already lower-cased word.
fn is_stop_word(word: &str) -> bool {
    matches!(
        word,
        // Articles, forms of "be"/"have"/"do", and modal verbs
        "the" | "a" | "an" | "is" | "are" | "was" | "were" | "be" | "been" | "being"
            | "have" | "has" | "had" | "do" | "does" | "did"
            | "will" | "would" | "could" | "should" | "may" | "might" | "can" | "shall"
            // Prepositions, conjunctions and particles
            | "to" | "of" | "in" | "for" | "on" | "with" | "at" | "by" | "from" | "as"
            | "or" | "and" | "but" | "if" | "not" | "no" | "so" | "up" | "out"
            // Pronouns and determiners
            | "it" | "its" | "this" | "that" | "these" | "those"
            | "we" | "you" | "they" | "he" | "she" | "my" | "your" | "our" | "us"
            // Quantifiers and intensifiers
            | "all" | "each" | "every" | "both" | "few" | "more" | "most" | "other"
            | "some" | "such" | "than" | "too" | "very" | "just" | "also" | "about"
            // Question words
            | "which" | "what" | "when" | "where" | "how" | "who" | "whom" | "why"
            // Position and time words
            | "into" | "through" | "during" | "before" | "after" | "above" | "below"
            | "between" | "same" | "different" | "then" | "there" | "here"
            // Common filler adjectives, verbs and numerals
            | "new" | "old" | "many" | "much" | "own" | "only" | "well"
            | "use" | "used" | "using" | "like" | "one" | "two" | "get" | "make" | "way"
    )
}
370
// Unit tests for theme extraction, question generation, citation lookup and
// Markdown rendering. The citation tests run against `ArxivDatabase::builtin()`,
// so what they return depends on that database's contents.
#[cfg(test)]
mod tests {
    use super::*;
    use crate::oracle::coursera::types::TranscriptInput;

    /// Build an English transcript with the given text and no segments.
    fn make_transcript(text: &str) -> TranscriptInput {
        TranscriptInput {
            text: text.to_string(),
            language: "en".to_string(),
            segments: vec![],
            source_path: "test.txt".to_string(),
        }
    }

    /// An explicit topic override should yield citations and questions.
    #[test]
    fn test_generate_reflection_with_topic() {
        let t = make_transcript(
            "Machine learning models need careful deployment. MLOps practices help \
             automate the deployment pipeline. Continuous integration ensures quality. \
             Testing machine learning models requires specialized approaches.",
        );
        let reading = generate_reflection(&t, Some("mlops"));
        assert!(!reading.citations.is_empty(), "Should find mlops citations");
        assert!(!reading.questions.is_empty());
    }

    /// Without an override, themes are detected from the text and drive the questions.
    #[test]
    fn test_generate_reflection_auto_themes() {
        let t = make_transcript(
            "Transformer models use attention mechanisms. Attention allows the model to \
             focus on relevant parts of the input. The transformer architecture has \
             revolutionized natural language processing. Attention is computed as a \
             weighted sum of values. Transformer attention enables parallel computation.",
        );
        let reading = generate_reflection(&t, None);
        assert!(!reading.themes.is_empty());
        assert!(!reading.questions.is_empty());
    }

    /// An empty transcript produces no themes.
    #[test]
    fn test_generate_reflection_empty_transcript() {
        let t = make_transcript("");
        let reading = generate_reflection(&t, None);
        assert!(reading.themes.is_empty());
    }

    /// A fully populated reading renders its title, themes, level tag and citation.
    #[test]
    fn test_render_reflection_markdown() {
        let reading = ReflectionReading {
            themes: vec!["Machine learning".to_string(), "Deployment".to_string()],
            questions: vec![ReflectionQuestion {
                question: "What are the key challenges?".to_string(),
                thinking_level: BloomLevel::Analysis,
            }],
            citations: vec![ArxivCitation {
                arxiv_id: "1706.03762".to_string(),
                title: "Attention Is All You Need".to_string(),
                authors: "Vaswani et al.".to_string(),
                year: 2017,
                url: "https://arxiv.org/abs/1706.03762".to_string(),
                abstract_snippet: "Proposes the Transformer.".to_string(),
                topics: vec!["transformer".to_string()],
            }],
        };
        let md = render_reflection_markdown(&reading);
        assert!(md.contains("# Reflection Reading"));
        assert!(md.contains("Machine learning"));
        assert!(md.contains("[Analysis]"));
        assert!(md.contains("Vaswani"));
        assert!(md.contains("https://arxiv.org"));
    }

    /// An empty reading renders the placeholder line of every section.
    #[test]
    fn test_render_reflection_empty() {
        let reading = ReflectionReading { themes: vec![], questions: vec![], citations: vec![] };
        let md = render_reflection_markdown(&reading);
        assert!(md.contains("No dominant themes"));
        assert!(md.contains("No reflection questions"));
        assert!(md.contains("No matching citations"));
    }

    /// Repeated phrases in the text surface as themes.
    #[test]
    fn test_extract_themes() {
        let themes = extract_themes(
            "Deep learning models require large datasets for training. Deep learning \
             has transformed computer vision and natural language processing. Training \
             deep learning models requires significant compute resources. Deep learning \
             architectures include transformers and convolutional networks.",
        );
        assert!(!themes.is_empty());
        // At least one theme should mention "deep" or "learning" (either the
        // bigram or one of its words).
        assert!(
            themes.iter().any(|t| t.to_lowercase().contains("deep") || t.to_lowercase().contains("learning")),
            "Themes: {:?}",
            themes
        );
    }

    /// Exactly one question is produced for each of the five Bloom levels.
    #[test]
    fn test_bloom_question_levels() {
        let themes = vec!["Machine learning".to_string()];
        let questions = generate_bloom_questions(&themes, "Machine learning is important.");
        assert_eq!(questions.len(), 5);

        let levels: Vec<BloomLevel> = questions.iter().map(|q| q.thinking_level).collect();
        assert!(levels.contains(&BloomLevel::Analysis));
        assert!(levels.contains(&BloomLevel::Synthesis));
        assert!(levels.contains(&BloomLevel::Evaluation));
        assert!(levels.contains(&BloomLevel::Application));
        assert!(levels.contains(&BloomLevel::Creation));
    }

    /// No themes means no questions.
    #[test]
    fn test_bloom_questions_empty_themes() {
        let questions = generate_bloom_questions(&[], "Some text");
        assert!(questions.is_empty());
    }

    /// A known topic returns between one and five citations.
    #[test]
    fn test_find_citations_for_topic() {
        let db = ArxivDatabase::builtin();
        let results = find_citations_for_topic(&db, "transformer");
        assert!(!results.is_empty());
        assert!(results.len() <= 5);
    }

    /// Known themes return at least one citation.
    #[test]
    fn test_find_citations_for_themes() {
        let db = ArxivDatabase::builtin();
        let themes = vec!["mlops".to_string(), "deployment".to_string()];
        let results = find_citations_for_themes(&db, &themes);
        assert!(!results.is_empty());
    }

    #[test]
    fn test_bloom_compare_contrast_template() {
        // "compare" in the text selects the second Analysis template
        // ("Compare and contrast ...") instead of the default first one.
        let themes = vec!["Neural networks".to_string()];
        let questions =
            generate_bloom_questions(&themes, "We compare different neural network architectures.");
        let analysis_q = questions
            .iter()
            .find(|q| q.thinking_level == BloomLevel::Analysis)
            .expect("unexpected failure");
        assert!(
            analysis_q.question.contains("Compare and contrast"),
            "Got: {}",
            analysis_q.question
        );
    }

    #[test]
    fn test_bloom_limitation_template() {
        // "limitation" in the text selects the second Evaluation template
        // ("Under what conditions would ... fail or underperform?").
        let themes = vec!["Attention".to_string()];
        let questions =
            generate_bloom_questions(&themes, "A key limitation of attention is memory cost.");
        let eval_q = questions
            .iter()
            .find(|q| q.thinking_level == BloomLevel::Evaluation)
            .expect("unexpected failure");
        assert!(
            eval_q.question.contains("conditions") || eval_q.question.contains("fail"),
            "Got: {}",
            eval_q.question
        );
    }

    #[test]
    fn test_capitalize_theme_empty() {
        // Empty input: there is no first character to upper-case.
        assert_eq!(capitalize_theme(""), "");
    }

    #[test]
    fn test_find_citations_for_topic_sparse() {
        // Aims at the keyword-split fallback, which is taken when the whole-topic
        // lookup yields fewer than 3 results.
        // NOTE(review): whether the fallback actually runs depends on the builtin
        // database having fewer than 3 matches for this topic — confirm.
        let db = ArxivDatabase::builtin();
        let results = find_citations_for_topic(&db, "obscure quantum federated distillation");
        // Only the upper bound is checked; the result may legitimately be empty.
        assert!(results.len() <= 5);
    }

    #[test]
    fn test_find_citations_for_themes_individual_fallback() {
        // Aims at the per-theme lookup, which is taken when the combined keyword
        // search yields fewer than 3 results.
        // NOTE(review): whether the fallback actually runs depends on the builtin
        // database contents — confirm.
        let db = ArxivDatabase::builtin();
        let themes =
            vec!["xyznonexistent".to_string(), "transformer".to_string(), "attention".to_string()];
        let results = find_citations_for_themes(&db, &themes);
        // Whichever path is taken, the known themes should produce results.
        assert!(!results.is_empty());
    }
}
batuta/oracle/coursera/reflection.rs

batuta/oracle/coursera/
reflection.rs