// Source: batuta/oracle/coursera/key_concepts.rs

//! Key Concepts reading generation
//!
//! Extracts technical concepts with definitions, categorizes them,
//! and inserts code examples when the transcript mentions programming patterns.
5
6use std::collections::HashMap;
7
8use super::types::{CodeExample, Concept, ConceptCategory, KeyConceptsReading, TranscriptInput};
9use super::vocabulary;
10
11/// Generate a key concepts reading from a transcript.
12pub fn generate_key_concepts(transcript: &TranscriptInput) -> KeyConceptsReading {
13    let concepts = extract_concepts(transcript);
14    let code_examples = extract_code_examples(transcript, &concepts);
15
16    KeyConceptsReading { concepts, code_examples }
17}
18
19/// Render key concepts as Markdown.
20pub fn render_key_concepts_markdown(reading: &KeyConceptsReading) -> String {
21    let mut md = String::new();
22    md.push_str("# Key Concepts\n\n");
23
24    if reading.concepts.is_empty() {
25        md.push_str("No key concepts extracted from this transcript.\n");
26        return md;
27    }
28
29    // Group by category
30    let mut by_category: HashMap<&str, Vec<&Concept>> = HashMap::new();
31    for concept in &reading.concepts {
32        by_category.entry(concept.category.as_str()).or_default().push(concept);
33    }
34
35    let mut categories: Vec<&&str> = by_category.keys().collect();
36    categories.sort();
37
38    for cat in categories {
39        let cat_concepts = &by_category[*cat];
40        md.push_str(&format!("## {}\n\n", cat));
41        md.push_str("| Concept | Definition |\n");
42        md.push_str("|---------|------------|\n");
43
44        for concept in cat_concepts {
45            md.push_str(&format!("| **{}** | {} |\n", concept.term, concept.definition));
46        }
47        md.push('\n');
48
49        // Add context quotes
50        for concept in cat_concepts {
51            if !concept.context.is_empty() {
52                md.push_str(&format!("> *\"{}\"*\n\n", concept.context));
53            }
54        }
55    }
56
57    // Code examples
58    if !reading.code_examples.is_empty() {
59        md.push_str("## Code Examples\n\n");
60        for example in &reading.code_examples {
61            md.push_str(&format!(
62                "### {} ({})\n\n```{}\n{}\n```\n\n",
63                example.related_concept, example.language, example.language, example.code
64            ));
65        }
66    }
67
68    md
69}
70
// ============================================================================
// Internal
// ============================================================================
74
75fn extract_concepts(transcript: &TranscriptInput) -> Vec<Concept> {
76    let vocab = vocabulary::extract_vocabulary(std::slice::from_ref(transcript));
77    let sentences = split_sentences(&transcript.text);
78
79    let mut concepts: Vec<Concept> = Vec::new();
80    let mut seen: std::collections::HashSet<String> = std::collections::HashSet::new();
81
82    for entry in &vocab {
83        let lower = entry.term.to_lowercase();
84        if seen.contains(&lower) {
85            continue;
86        }
87        seen.insert(lower.clone());
88
89        let context = find_best_context(&sentences, &entry.term);
90        let definition = if entry.definition.len() > 10 {
91            entry.definition.clone()
92        } else {
93            derive_concept_definition(&sentences, &entry.term)
94        };
95
96        concepts.push(Concept {
97            term: entry.term.clone(),
98            definition,
99            context,
100            category: entry.category,
101        });
102    }
103
104    // Also extract definition patterns not caught by vocab extraction
105    for sentence in &sentences {
106        if let Some(concept) = try_extract_definition_pattern(sentence) {
107            let lower = concept.term.to_lowercase();
108            if !seen.contains(&lower) {
109                seen.insert(lower);
110                concepts.push(concept);
111            }
112        }
113    }
114
115    // Limit to top 15 concepts
116    concepts.truncate(15);
117    concepts
118}
119
120fn find_best_context(sentences: &[String], term: &str) -> String {
121    let lower_term = term.to_lowercase();
122
123    // Prefer sentences with definitional patterns
124    for s in sentences {
125        let lower = s.to_lowercase();
126        if lower.contains(&lower_term) && (lower.contains(" is ") || lower.contains(" are ")) {
127            return truncate(s, 150);
128        }
129    }
130
131    // Fall back to first mention
132    for s in sentences {
133        if s.to_lowercase().contains(&lower_term) {
134            return truncate(s, 150);
135        }
136    }
137
138    String::new()
139}
140
141fn derive_concept_definition(sentences: &[String], term: &str) -> String {
142    let lower_term = term.to_lowercase();
143
144    for sentence in sentences {
145        let lower = sentence.to_lowercase();
146
147        // "X is ..."
148        if let Some(pos) = lower.find(&format!("{} is ", lower_term)) {
149            let start = pos + lower_term.len() + 4;
150            if let Some(def) = sentence.get(start..) {
151                let end = def.find('.').unwrap_or(def.len()).min(120);
152                return capitalize_first(safe_truncate_bytes(def, end).trim());
153            }
154        }
155
156        // "X, also known as ..."
157        if let Some(pos) = lower.find(&format!("{}, also known as ", lower_term)) {
158            let start = pos + lower_term.len() + 17;
159            if let Some(def) = sentence.get(start..) {
160                let end = def.find('.').unwrap_or(def.len()).min(120);
161                return format!("Also known as {}", safe_truncate_bytes(def, end).trim());
162            }
163        }
164    }
165
166    format!("Technical concept: {term}")
167}
168
169fn try_extract_definition_pattern(sentence: &str) -> Option<Concept> {
170    let patterns = [" is a ", " is an ", " is the ", " refers to "];
171    let lower = sentence.to_lowercase();
172
173    patterns.iter().find_map(|pat| try_match_definition(sentence, &lower, pat))
174}
175
176fn try_match_definition(sentence: &str, lower: &str, pat: &str) -> Option<Concept> {
177    let pos = lower.find(pat)?;
178
179    let term = extract_term_before(sentence, pos);
180    if term.len() < 3 || term.chars().next().is_some_and(|c| c.is_lowercase()) {
181        return None;
182    }
183
184    let def_start = pos + pat.len();
185    let definition = sentence.get(def_start..)?;
186    let end = definition.find('.').unwrap_or(definition.len()).min(120);
187    let definition = capitalize_first(safe_truncate_bytes(definition, end).trim());
188
189    if definition.len() < 5 {
190        return None;
191    }
192
193    Some(Concept {
194        term: term.trim().to_string(),
195        definition,
196        context: truncate(sentence, 150),
197        category: ConceptCategory::General,
198    })
199}
200
/// Return the last (up to three) whitespace-separated words that precede byte
/// offset `pos`, joined by single spaces.
///
/// Yields an empty string when `pos` is out of range or not on a character
/// boundary.
fn extract_term_before(sentence: &str, pos: usize) -> String {
    let prefix = sentence.get(..pos).unwrap_or_default();

    // Walk backwards so only the trailing words are visited, then restore order.
    let mut trailing: Vec<&str> = prefix.split_whitespace().rev().take(3).collect();
    trailing.reverse();
    trailing.join(" ")
}
214
215fn extract_code_examples(transcript: &TranscriptInput, concepts: &[Concept]) -> Vec<CodeExample> {
216    let mut examples = Vec::new();
217    let text_lower = transcript.text.to_lowercase();
218
219    extract_bash_examples(&text_lower, concepts, &mut examples);
220    extract_language_example(
221        &text_lower,
222        concepts,
223        &mut examples,
224        &["python", "import", "pytorch"],
225        "python",
226        &["python", "pytorch", "model"],
227        "Python",
228        "import torch\nmodel = torch.load(\"model.pt\")\noutput = model(input_tensor)",
229    );
230    extract_language_example(
231        &text_lower,
232        concepts,
233        &mut examples,
234        &["rust", "cargo", "trueno"],
235        "rust",
236        &["rust", "cargo", "trueno"],
237        "Rust",
238        "use trueno::Tensor;\nlet data = Tensor::from_slice(&[1.0, 2.0, 3.0]);\nlet result = data.matmul(&weights)?;",
239    );
240
241    examples.truncate(5);
242    examples
243}
244
245fn extract_bash_examples(text_lower: &str, concepts: &[Concept], examples: &mut Vec<CodeExample>) {
246    let bash_patterns: &[(&str, &str)] = &[
247        ("docker", "docker run -p 8080:8080 model-server"),
248        ("pip", "pip install torch transformers"),
249        ("cargo", "cargo build --release"),
250        ("kubectl", "kubectl apply -f deployment.yaml"),
251        ("curl", "curl -X POST http://localhost:8080/predict -d '{\"input\": \"text\"}'"),
252        ("git", "git clone https://github.com/org/repo.git"),
253    ];
254
255    for (keyword, code) in bash_patterns {
256        if text_lower.contains(keyword) {
257            let related = find_related_concept(concepts, &[keyword])
258                .unwrap_or_else(|| (*keyword).to_string());
259            examples.push(CodeExample {
260                language: "bash".to_string(),
261                code: (*code).to_string(),
262                related_concept: related,
263            });
264        }
265    }
266}
267
268#[allow(clippy::too_many_arguments)]
269fn extract_language_example(
270    text_lower: &str,
271    concepts: &[Concept],
272    examples: &mut Vec<CodeExample>,
273    triggers: &[&str],
274    language: &str,
275    concept_keywords: &[&str],
276    fallback_name: &str,
277    code: &str,
278) {
279    if triggers.iter().any(|t| text_lower.contains(t)) {
280        let related = find_related_concept(concepts, concept_keywords)
281            .unwrap_or_else(|| fallback_name.to_string());
282        examples.push(CodeExample {
283            language: language.to_string(),
284            code: code.to_string(),
285            related_concept: related,
286        });
287    }
288}
289
290fn find_related_concept(concepts: &[Concept], keywords: &[&str]) -> Option<String> {
291    concepts
292        .iter()
293        .find(|c| {
294            let cl = c.term.to_lowercase();
295            keywords.iter().any(|kw| cl.contains(kw))
296        })
297        .map(|c| c.term.clone())
298}
299
/// Split `text` into trimmed, non-empty sentences.
///
/// A sentence ends at `.`, `!` or `?`, unless the terminator is immediately
/// followed by an alphanumeric character or by another terminator. That keeps
/// tokens such as `model.pt`, `2.5` or `github.com` intact and treats runs
/// like `...` or `?!` as a single sentence ending. Trailing text without a
/// terminator becomes the final sentence.
///
/// This is a heuristic: abbreviations followed by a space ("e.g. foo") still
/// end a sentence.
fn split_sentences(text: &str) -> Vec<String> {
    let is_terminator = |c: char| matches!(c, '.' | '!' | '?');

    let mut sentences = Vec::new();
    let mut current = String::new();
    let mut chars = text.chars().peekable();

    while let Some(ch) = chars.next() {
        current.push(ch);
        if !is_terminator(ch) {
            continue;
        }

        // Look ahead: a terminator glued to more text or punctuation is not a boundary.
        let continues = chars
            .peek()
            .is_some_and(|&next| next.is_alphanumeric() || is_terminator(next));
        if continues {
            continue;
        }

        let trimmed = current.trim();
        if !trimmed.is_empty() {
            sentences.push(trimmed.to_string());
        }
        current.clear();
    }

    // Whatever is left has no terminal punctuation but is still a sentence.
    let trimmed = current.trim();
    if !trimmed.is_empty() {
        sentences.push(trimmed.to_string());
    }

    sentences
}
322
323fn truncate(s: &str, max: usize) -> String {
324    if s.len() <= max {
325        s.to_string()
326    } else {
327        format!("{}...", safe_truncate_bytes(s, max))
328    }
329}
330
/// Return the longest prefix of `s` that is at most `max_bytes` long and ends
/// on a character boundary. The whole string is returned when it already fits.
fn safe_truncate_bytes(s: &str, max_bytes: usize) -> &str {
    if max_bytes >= s.len() {
        return s;
    }

    // Offset 0 is always a boundary, so the search cannot come up empty.
    let cut = (0..=max_bytes)
        .rev()
        .find(|&idx| s.is_char_boundary(idx))
        .unwrap_or(0);
    &s[..cut]
}
342
/// Uppercase the first character of `s` and leave the rest untouched.
///
/// The first character may expand to several characters when uppercased
/// (for example 'ß' becomes "SS"). An empty input yields an empty string.
fn capitalize_first(s: &str) -> String {
    let mut rest = s.chars();
    let Some(first) = rest.next() else {
        return String::new();
    };

    let mut out = String::with_capacity(s.len());
    out.extend(first.to_uppercase());
    out.push_str(rest.as_str());
    out
}
350
#[cfg(test)]
mod tests {
    use super::*;
    use crate::oracle::coursera::types::TranscriptSegment;

    /// Build an English transcript with no segments around `text`.
    fn make_transcript(text: &str) -> TranscriptInput {
        TranscriptInput {
            text: text.to_string(),
            language: "en".to_string(),
            segments: vec![],
            source_path: "test.txt".to_string(),
        }
    }

    #[test]
    fn test_generate_key_concepts() {
        let t = make_transcript(
            "MLOps is the practice of deploying ML models to production. \
             MLOps combines DevOps and machine learning workflows. \
             CI/CD pipelines automate the deployment process. \
             CI/CD is essential for reliable software delivery. \
             GPU acceleration speeds up model inference. \
             GPU computing enables parallel processing.",
        );
        let reading = generate_key_concepts(&t);
        assert!(!reading.concepts.is_empty());
    }

    #[test]
    fn test_generate_key_concepts_empty() {
        let t = make_transcript("The cat sat on the mat.");
        let reading = generate_key_concepts(&t);
        assert!(reading.concepts.is_empty());
    }

    #[test]
    fn test_render_key_concepts_markdown() {
        let reading = KeyConceptsReading {
            concepts: vec![Concept {
                term: "MLOps".to_string(),
                definition: "Machine Learning Operations".to_string(),
                context: "MLOps combines ML and DevOps.".to_string(),
                category: ConceptCategory::Pattern,
            }],
            code_examples: vec![CodeExample {
                language: "bash".to_string(),
                code: "docker run app".to_string(),
                related_concept: "Docker".to_string(),
            }],
        };
        let md = render_key_concepts_markdown(&reading);
        assert!(md.contains("# Key Concepts"));
        assert!(md.contains("MLOps"));
        assert!(md.contains("## Code Examples"));
        assert!(md.contains("```bash"));
    }

    #[test]
    fn test_render_key_concepts_empty() {
        let reading = KeyConceptsReading { concepts: vec![], code_examples: vec![] };
        let md = render_key_concepts_markdown(&reading);
        assert!(md.contains("No key concepts extracted"));
    }

    #[test]
    fn test_extract_code_examples_bash() {
        let t = make_transcript(
            "We use docker to deploy our models. Docker containers are lightweight.",
        );
        let concepts = vec![Concept {
            term: "Docker".to_string(),
            definition: "Container runtime".to_string(),
            context: String::new(),
            category: ConceptCategory::Tool,
        }];
        let examples = extract_code_examples(&t, &concepts);
        assert!(!examples.is_empty());
        assert_eq!(examples[0].language, "bash");
    }

    #[test]
    fn test_extract_code_examples_python() {
        let t = make_transcript(
            "Python and PyTorch are used for model training. Python scripts handle data processing.",
        );
        let concepts = vec![];
        let examples = extract_code_examples(&t, &concepts);
        let python_example = examples.iter().find(|e| e.language == "python");
        assert!(python_example.is_some());
    }

    #[test]
    fn test_extract_code_examples_rust() {
        let t = make_transcript(
            "Rust and cargo are used for high-performance computing. Rust provides memory safety.",
        );
        let concepts = vec![];
        let examples = extract_code_examples(&t, &concepts);
        let rust_example = examples.iter().find(|e| e.language == "rust");
        assert!(rust_example.is_some());
    }

    #[test]
    fn test_try_extract_definition_pattern() {
        let result = try_extract_definition_pattern(
            "Batch Normalization is a technique that normalizes layer inputs.",
        );
        let concept = result.expect("an `X is a ...` sentence should yield a concept");
        assert!(concept.term.contains("Normalization"));
    }

    #[test]
    fn test_duplicate_terms_deduplicated() {
        // Repeated mentions of one term must collapse into a single concept.
        let t = make_transcript(
            "MLOps is the practice of deploying ML models. MLOps automates deployment. \
             MLOps combines DevOps and ML. MLOps pipelines handle continuous delivery. \
             MLOps teams build reliable systems.",
        );
        let reading = generate_key_concepts(&t);
        let mlops_count =
            reading.concepts.iter().filter(|c| c.term.to_lowercase() == "mlops").count();
        assert!(mlops_count <= 1, "MLOps should appear at most once");
    }

    #[test]
    fn test_derive_concept_definition_is_pattern() {
        // Exercises the "X is ..." branch of derive_concept_definition.
        let sentences =
            vec!["Kubernetes is an open-source container orchestration platform.".to_string()];
        let def = super::derive_concept_definition(&sentences, "Kubernetes");
        assert!(def.contains("open-source") || def.contains("container"), "Got: {def}");
    }

    #[test]
    fn test_derive_concept_definition_also_known_as() {
        // Exercises the "X, also known as ..." branch.
        let sentences = vec!["K8s, also known as Kubernetes container orchestration.".to_string()];
        let def = super::derive_concept_definition(&sentences, "K8s");
        assert!(def.starts_with("Also known as"), "Got: {def}");
    }

    #[test]
    fn test_derive_concept_definition_fallback() {
        // No sentence matches either pattern, so the generic fallback is returned.
        let sentences = vec!["Random text about something.".to_string()];
        let def = super::derive_concept_definition(&sentences, "QUIC");
        assert!(def.contains("Technical concept: QUIC"), "Got: {def}");
    }

    #[test]
    fn test_find_best_context_no_match() {
        // A term that no sentence mentions yields an empty context.
        let sentences = vec!["The cat sat on the mat.".to_string()];
        let ctx = super::find_best_context(&sentences, "kubernetes");
        assert!(ctx.is_empty());
    }

    #[test]
    fn test_try_match_definition_short_term_rejected() {
        // The candidate term "It" is shorter than 3 bytes and must be rejected.
        let result = super::try_match_definition("It is a test.", "it is a test.", " is a ");
        assert!(result.is_none());
    }

    #[test]
    fn test_try_match_definition_short_definition_rejected() {
        // The definition "ok" is shorter than 5 bytes and must be rejected.
        let result = try_extract_definition_pattern("BigThing is a ok.");
        assert!(result.is_none());
    }

    #[test]
    fn test_truncate_long_string() {
        // Over-long input is cut to `max` bytes and suffixed with "...".
        let long = "a".repeat(200);
        let result = super::truncate(&long, 50);
        assert!(result.ends_with("..."));
        assert_eq!(result.len(), 53); // 50 bytes of content + "..."
    }

    #[test]
    fn test_truncate_short_string_unchanged() {
        // Input that already fits is returned as-is, without an ellipsis.
        assert_eq!(super::truncate("short", 50), "short");
    }

    #[test]
    fn test_safe_truncate_bytes_multibyte() {
        // 'é' occupies bytes 1..3, so offset 2 falls inside it and the cut
        // must back up to offset 1.
        let s = "héllo wörld";
        let truncated = super::safe_truncate_bytes(s, 2);
        assert_eq!(truncated, "h");
        assert!(s.is_char_boundary(truncated.len()));

        // Offset 3 is already a boundary and is used as-is.
        assert_eq!(super::safe_truncate_bytes(s, 3), "hé");
    }

    #[test]
    fn test_capitalize_first_empty() {
        // An empty input has no first character to capitalize.
        assert_eq!(super::capitalize_first(""), "");
    }

    #[test]
    fn test_capitalize_first_non_empty() {
        assert_eq!(super::capitalize_first("hello world"), "Hello world");
    }

    #[test]
    fn test_split_sentences_trailing_text() {
        // Trailing text without terminal punctuation is still a sentence.
        let sentences = super::split_sentences("Hello world. This has no period");
        assert_eq!(sentences.len(), 2);
        assert_eq!(sentences[1], "This has no period");
    }

    #[test]
    fn test_definition_pattern_refers_to() {
        // Exercises the " refers to " marker in try_extract_definition_pattern.
        let result =
            try_extract_definition_pattern("MLOps refers to the practice of operationalizing ML.");
        let concept = result.expect("an `X refers to ...` sentence should yield a concept");
        assert!(
            concept.definition.contains("practice")
                || concept.definition.contains("operationalizing")
        );
    }

    #[test]
    fn test_concepts_with_segments() {
        let t = TranscriptInput {
            text: "API endpoints serve ML predictions. The API handles inference. \
                   GPU acceleration is critical. GPU kernels run fast."
                .to_string(),
            language: "en".to_string(),
            segments: vec![TranscriptSegment {
                start: 0.0,
                end: 10.0,
                text: "API endpoints serve ML predictions.".to_string(),
            }],
            source_path: "test.json".to_string(),
        };
        let reading = generate_key_concepts(&t);
        assert!(!reading.concepts.is_empty());
    }
}