1use std::collections::HashMap;
7
8use super::types::{CodeExample, Concept, ConceptCategory, KeyConceptsReading, TranscriptInput};
9use super::vocabulary;
10
11pub fn generate_key_concepts(transcript: &TranscriptInput) -> KeyConceptsReading {
13 let concepts = extract_concepts(transcript);
14 let code_examples = extract_code_examples(transcript, &concepts);
15
16 KeyConceptsReading { concepts, code_examples }
17}
18
19pub fn render_key_concepts_markdown(reading: &KeyConceptsReading) -> String {
21 let mut md = String::new();
22 md.push_str("# Key Concepts\n\n");
23
24 if reading.concepts.is_empty() {
25 md.push_str("No key concepts extracted from this transcript.\n");
26 return md;
27 }
28
29 let mut by_category: HashMap<&str, Vec<&Concept>> = HashMap::new();
31 for concept in &reading.concepts {
32 by_category.entry(concept.category.as_str()).or_default().push(concept);
33 }
34
35 let mut categories: Vec<&&str> = by_category.keys().collect();
36 categories.sort();
37
38 for cat in categories {
39 let cat_concepts = &by_category[*cat];
40 md.push_str(&format!("## {}\n\n", cat));
41 md.push_str("| Concept | Definition |\n");
42 md.push_str("|---------|------------|\n");
43
44 for concept in cat_concepts {
45 md.push_str(&format!("| **{}** | {} |\n", concept.term, concept.definition));
46 }
47 md.push('\n');
48
49 for concept in cat_concepts {
51 if !concept.context.is_empty() {
52 md.push_str(&format!("> *\"{}\"*\n\n", concept.context));
53 }
54 }
55 }
56
57 if !reading.code_examples.is_empty() {
59 md.push_str("## Code Examples\n\n");
60 for example in &reading.code_examples {
61 md.push_str(&format!(
62 "### {} ({})\n\n```{}\n{}\n```\n\n",
63 example.related_concept, example.language, example.language, example.code
64 ));
65 }
66 }
67
68 md
69}
70
71fn extract_concepts(transcript: &TranscriptInput) -> Vec<Concept> {
76 let vocab = vocabulary::extract_vocabulary(std::slice::from_ref(transcript));
77 let sentences = split_sentences(&transcript.text);
78
79 let mut concepts: Vec<Concept> = Vec::new();
80 let mut seen: std::collections::HashSet<String> = std::collections::HashSet::new();
81
82 for entry in &vocab {
83 let lower = entry.term.to_lowercase();
84 if seen.contains(&lower) {
85 continue;
86 }
87 seen.insert(lower.clone());
88
89 let context = find_best_context(&sentences, &entry.term);
90 let definition = if entry.definition.len() > 10 {
91 entry.definition.clone()
92 } else {
93 derive_concept_definition(&sentences, &entry.term)
94 };
95
96 concepts.push(Concept {
97 term: entry.term.clone(),
98 definition,
99 context,
100 category: entry.category,
101 });
102 }
103
104 for sentence in &sentences {
106 if let Some(concept) = try_extract_definition_pattern(sentence) {
107 let lower = concept.term.to_lowercase();
108 if !seen.contains(&lower) {
109 seen.insert(lower);
110 concepts.push(concept);
111 }
112 }
113 }
114
115 concepts.truncate(15);
117 concepts
118}
119
120fn find_best_context(sentences: &[String], term: &str) -> String {
121 let lower_term = term.to_lowercase();
122
123 for s in sentences {
125 let lower = s.to_lowercase();
126 if lower.contains(&lower_term) && (lower.contains(" is ") || lower.contains(" are ")) {
127 return truncate(s, 150);
128 }
129 }
130
131 for s in sentences {
133 if s.to_lowercase().contains(&lower_term) {
134 return truncate(s, 150);
135 }
136 }
137
138 String::new()
139}
140
141fn derive_concept_definition(sentences: &[String], term: &str) -> String {
142 let lower_term = term.to_lowercase();
143
144 for sentence in sentences {
145 let lower = sentence.to_lowercase();
146
147 if let Some(pos) = lower.find(&format!("{} is ", lower_term)) {
149 let start = pos + lower_term.len() + 4;
150 if let Some(def) = sentence.get(start..) {
151 let end = def.find('.').unwrap_or(def.len()).min(120);
152 return capitalize_first(safe_truncate_bytes(def, end).trim());
153 }
154 }
155
156 if let Some(pos) = lower.find(&format!("{}, also known as ", lower_term)) {
158 let start = pos + lower_term.len() + 17;
159 if let Some(def) = sentence.get(start..) {
160 let end = def.find('.').unwrap_or(def.len()).min(120);
161 return format!("Also known as {}", safe_truncate_bytes(def, end).trim());
162 }
163 }
164 }
165
166 format!("Technical concept: {term}")
167}
168
169fn try_extract_definition_pattern(sentence: &str) -> Option<Concept> {
170 let patterns = [" is a ", " is an ", " is the ", " refers to "];
171 let lower = sentence.to_lowercase();
172
173 patterns.iter().find_map(|pat| try_match_definition(sentence, &lower, pat))
174}
175
176fn try_match_definition(sentence: &str, lower: &str, pat: &str) -> Option<Concept> {
177 let pos = lower.find(pat)?;
178
179 let term = extract_term_before(sentence, pos);
180 if term.len() < 3 || term.chars().next().is_some_and(|c| c.is_lowercase()) {
181 return None;
182 }
183
184 let def_start = pos + pat.len();
185 let definition = sentence.get(def_start..)?;
186 let end = definition.find('.').unwrap_or(definition.len()).min(120);
187 let definition = capitalize_first(safe_truncate_bytes(definition, end).trim());
188
189 if definition.len() < 5 {
190 return None;
191 }
192
193 Some(Concept {
194 term: term.trim().to_string(),
195 definition,
196 context: truncate(sentence, 150),
197 category: ConceptCategory::General,
198 })
199}
200
/// Return the last (up to) three whitespace-separated words appearing
/// before byte offset `pos` in `sentence`, joined with single spaces.
fn extract_term_before(sentence: &str, pos: usize) -> String {
    let words: Vec<&str> = sentence.get(..pos).unwrap_or("").split_whitespace().collect();
    let first = words.len().saturating_sub(3);
    words[first..].join(" ")
}
214
215fn extract_code_examples(transcript: &TranscriptInput, concepts: &[Concept]) -> Vec<CodeExample> {
216 let mut examples = Vec::new();
217 let text_lower = transcript.text.to_lowercase();
218
219 extract_bash_examples(&text_lower, concepts, &mut examples);
220 extract_language_example(
221 &text_lower,
222 concepts,
223 &mut examples,
224 &["python", "import", "pytorch"],
225 "python",
226 &["python", "pytorch", "model"],
227 "Python",
228 "import torch\nmodel = torch.load(\"model.pt\")\noutput = model(input_tensor)",
229 );
230 extract_language_example(
231 &text_lower,
232 concepts,
233 &mut examples,
234 &["rust", "cargo", "trueno"],
235 "rust",
236 &["rust", "cargo", "trueno"],
237 "Rust",
238 "use trueno::Tensor;\nlet data = Tensor::from_slice(&[1.0, 2.0, 3.0]);\nlet result = data.matmul(&weights)?;",
239 );
240
241 examples.truncate(5);
242 examples
243}
244
245fn extract_bash_examples(text_lower: &str, concepts: &[Concept], examples: &mut Vec<CodeExample>) {
246 let bash_patterns: &[(&str, &str)] = &[
247 ("docker", "docker run -p 8080:8080 model-server"),
248 ("pip", "pip install torch transformers"),
249 ("cargo", "cargo build --release"),
250 ("kubectl", "kubectl apply -f deployment.yaml"),
251 ("curl", "curl -X POST http://localhost:8080/predict -d '{\"input\": \"text\"}'"),
252 ("git", "git clone https://github.com/org/repo.git"),
253 ];
254
255 for (keyword, code) in bash_patterns {
256 if text_lower.contains(keyword) {
257 let related = find_related_concept(concepts, &[keyword])
258 .unwrap_or_else(|| (*keyword).to_string());
259 examples.push(CodeExample {
260 language: "bash".to_string(),
261 code: (*code).to_string(),
262 related_concept: related,
263 });
264 }
265 }
266}
267
268#[allow(clippy::too_many_arguments)]
269fn extract_language_example(
270 text_lower: &str,
271 concepts: &[Concept],
272 examples: &mut Vec<CodeExample>,
273 triggers: &[&str],
274 language: &str,
275 concept_keywords: &[&str],
276 fallback_name: &str,
277 code: &str,
278) {
279 if triggers.iter().any(|t| text_lower.contains(t)) {
280 let related = find_related_concept(concepts, concept_keywords)
281 .unwrap_or_else(|| fallback_name.to_string());
282 examples.push(CodeExample {
283 language: language.to_string(),
284 code: code.to_string(),
285 related_concept: related,
286 });
287 }
288}
289
290fn find_related_concept(concepts: &[Concept], keywords: &[&str]) -> Option<String> {
291 concepts
292 .iter()
293 .find(|c| {
294 let cl = c.term.to_lowercase();
295 keywords.iter().any(|kw| cl.contains(kw))
296 })
297 .map(|c| c.term.clone())
298}
299
/// Split text into sentences on '.', '!' and '?' (each terminator stays
/// attached to its sentence); a trailing fragment with no terminator is
/// kept. Fragments are trimmed and empty ones dropped.
fn split_sentences(text: &str) -> Vec<String> {
    text.split_inclusive(&['.', '!', '?'][..])
        .map(str::trim)
        .filter(|s| !s.is_empty())
        .map(str::to_string)
        .collect()
}
322
/// Return `s` unchanged when it fits in `max` bytes; otherwise cut it at
/// the nearest UTF-8 character boundary at or below `max` bytes and append
/// "...".
fn truncate(s: &str, max: usize) -> String {
    if s.len() <= max {
        return s.to_string();
    }
    let mut cut = max;
    // Back up to a valid boundary; index 0 is always a boundary.
    while !s.is_char_boundary(cut) {
        cut -= 1;
    }
    format!("{}...", &s[..cut])
}
330
/// Borrow at most `max_bytes` bytes of `s`, backing up to the nearest
/// UTF-8 character boundary so the returned slice is always valid.
fn safe_truncate_bytes(s: &str, max_bytes: usize) -> &str {
    if max_bytes >= s.len() {
        return s;
    }
    let mut cut = max_bytes;
    // `is_char_boundary(0)` is always true, so this loop terminates.
    while !s.is_char_boundary(cut) {
        cut -= 1;
    }
    &s[..cut]
}
342
/// Uppercase the first character of `s` (Unicode-aware; may expand to
/// several characters, e.g. "ß" -> "SS"), leaving the rest untouched.
fn capitalize_first(s: &str) -> String {
    let mut chars = s.chars();
    chars
        .next()
        .map(|first| format!("{}{}", first.to_uppercase(), chars.as_str()))
        .unwrap_or_default()
}
350
#[cfg(test)]
mod tests {
    use super::*;
    use crate::oracle::coursera::types::TranscriptSegment;

    // Helper: minimal English transcript with no timing segments.
    fn make_transcript(text: &str) -> TranscriptInput {
        TranscriptInput {
            text: text.to_string(),
            language: "en".to_string(),
            segments: vec![],
            source_path: "test.txt".to_string(),
        }
    }

    // A transcript dense with technical terms should yield at least one concept.
    #[test]
    fn test_generate_key_concepts() {
        let t = make_transcript(
            "MLOps is the practice of deploying ML models to production. \
             MLOps combines DevOps and machine learning workflows. \
             CI/CD pipelines automate the deployment process. \
             CI/CD is essential for reliable software delivery. \
             GPU acceleration speeds up model inference. \
             GPU computing enables parallel processing.",
        );
        let reading = generate_key_concepts(&t);
        assert!(!reading.concepts.is_empty());
    }

    // Plain non-technical prose should produce no concepts.
    #[test]
    fn test_generate_key_concepts_empty() {
        let t = make_transcript("The cat sat on the mat.");
        let reading = generate_key_concepts(&t);
        assert!(reading.concepts.is_empty());
    }

    // Rendering includes the header, concept terms, and a fenced code example.
    #[test]
    fn test_render_key_concepts_markdown() {
        let reading = KeyConceptsReading {
            concepts: vec![Concept {
                term: "MLOps".to_string(),
                definition: "Machine Learning Operations".to_string(),
                context: "MLOps combines ML and DevOps.".to_string(),
                category: ConceptCategory::Pattern,
            }],
            code_examples: vec![CodeExample {
                language: "bash".to_string(),
                code: "docker run app".to_string(),
                related_concept: "Docker".to_string(),
            }],
        };
        let md = render_key_concepts_markdown(&reading);
        assert!(md.contains("# Key Concepts"));
        assert!(md.contains("MLOps"));
        assert!(md.contains("## Code Examples"));
        assert!(md.contains("```bash"));
    }

    // An empty reading renders the placeholder message instead of tables.
    #[test]
    fn test_render_key_concepts_empty() {
        let reading = KeyConceptsReading { concepts: vec![], code_examples: vec![] };
        let md = render_key_concepts_markdown(&reading);
        assert!(md.contains("No key concepts extracted"));
    }

    // "docker" in the transcript triggers a bash code example.
    #[test]
    fn test_extract_code_examples_bash() {
        let t = make_transcript(
            "We use docker to deploy our models. Docker containers are lightweight.",
        );
        let concepts = vec![Concept {
            term: "Docker".to_string(),
            definition: "Container runtime".to_string(),
            context: "".to_string(),
            category: ConceptCategory::Tool,
        }];
        let examples = extract_code_examples(&t, &concepts);
        assert!(!examples.is_empty());
        assert_eq!(examples[0].language, "bash");
    }

    // Python trigger keywords produce a python-language example.
    #[test]
    fn test_extract_code_examples_python() {
        let t = make_transcript("Python and PyTorch are used for model training. Python scripts handle data processing.");
        let concepts = vec![];
        let examples = extract_code_examples(&t, &concepts);
        let python_example = examples.iter().find(|e| e.language == "python");
        assert!(python_example.is_some());
    }

    // Rust trigger keywords produce a rust-language example.
    #[test]
    fn test_extract_code_examples_rust() {
        let t = make_transcript(
            "Rust and cargo are used for high-performance computing. Rust provides memory safety.",
        );
        let concepts = vec![];
        let examples = extract_code_examples(&t, &concepts);
        let rust_example = examples.iter().find(|e| e.language == "rust");
        assert!(rust_example.is_some());
    }

    // "X is a ..." sentences are recognized as definition patterns.
    #[test]
    fn test_try_extract_definition_pattern() {
        let result = try_extract_definition_pattern(
            "Batch Normalization is a technique that normalizes layer inputs.",
        );
        assert!(result.is_some());
        let concept = result.expect("operation failed");
        assert!(concept.term.contains("Normalization"));
    }

    // The same term mentioned many times yields at most one concept.
    #[test]
    fn test_duplicate_terms_deduplicated() {
        let t = make_transcript(
            "MLOps is the practice of deploying ML models. MLOps automates deployment. \
             MLOps combines DevOps and ML. MLOps pipelines handle continuous delivery. \
             MLOps teams build reliable systems.",
        );
        let reading = generate_key_concepts(&t);
        let mlops_count =
            reading.concepts.iter().filter(|c| c.term.to_lowercase() == "mlops").count();
        assert!(mlops_count <= 1, "MLOps should appear at most once");
    }

    // "<term> is ..." sentences yield a derived definition.
    #[test]
    fn test_derive_concept_definition_is_pattern() {
        let sentences =
            vec!["Kubernetes is an open-source container orchestration platform.".to_string()];
        let def = super::derive_concept_definition(&sentences, "Kubernetes");
        assert!(def.contains("open-source") || def.contains("container"), "Got: {def}");
    }

    // "<term>, also known as ..." sentences yield an "Also known as" definition.
    #[test]
    fn test_derive_concept_definition_also_known_as() {
        let sentences = vec!["K8s, also known as Kubernetes container orchestration.".to_string()];
        let def = super::derive_concept_definition(&sentences, "K8s");
        assert!(def.starts_with("Also known as"), "Got: {def}");
    }

    // No matching phrasing falls back to the generic label.
    #[test]
    fn test_derive_concept_definition_fallback() {
        let sentences = vec!["Random text about something.".to_string()];
        let def = super::derive_concept_definition(&sentences, "QUIC");
        assert!(def.contains("Technical concept: QUIC"), "Got: {def}");
    }

    // A term absent from every sentence yields an empty context.
    #[test]
    fn test_find_best_context_no_match() {
        let sentences = vec!["The cat sat on the mat.".to_string()];
        let ctx = super::find_best_context(&sentences, "kubernetes");
        assert!(ctx.is_empty());
    }

    // Terms shorter than 3 bytes ("It") are rejected.
    #[test]
    fn test_try_match_definition_short_term_rejected() {
        let result = super::try_match_definition("It is a test.", "it is a test.", " is a ");
        assert!(result.is_none());
    }

    // Definitions shorter than 5 bytes ("Ok") are rejected.
    #[test]
    fn test_try_match_definition_short_definition_rejected() {
        let result = try_extract_definition_pattern("BigThing is a ok.");
        assert!(result.is_none());
    }

    // Over-long strings are cut and suffixed with "...".
    #[test]
    fn test_truncate_long_string() {
        let long = "a".repeat(200);
        let result = super::truncate(&long, 50);
        assert!(result.ends_with("..."));
        // 50 bytes of content plus the 3-byte "..." suffix at most.
        assert!(result.len() <= 54); }

    // Truncation never splits a multi-byte UTF-8 character.
    #[test]
    fn test_safe_truncate_bytes_multibyte() {
        let s = "héllo wörld";
        let truncated = super::safe_truncate_bytes(s, 3);
        assert!(!truncated.is_empty());
        assert!(s.is_char_boundary(truncated.len()));
    }

    // Capitalizing the empty string is a no-op.
    #[test]
    fn test_capitalize_first_empty() {
        assert_eq!(super::capitalize_first(""), "");
    }

    // A trailing fragment without a terminator is kept as its own sentence.
    #[test]
    fn test_split_sentences_trailing_text() {
        let sentences = super::split_sentences("Hello world. This has no period");
        assert_eq!(sentences.len(), 2);
        assert_eq!(sentences[1], "This has no period");
    }

    // "refers to" is recognized as a definition phrasing.
    #[test]
    fn test_definition_pattern_refers_to() {
        let result =
            try_extract_definition_pattern("MLOps refers to the practice of operationalizing ML.");
        assert!(result.is_some());
        let concept = result.expect("operation failed");
        assert!(
            concept.definition.contains("practice")
                || concept.definition.contains("operationalizing")
        );
    }

    // Concept extraction also works when timing segments are present.
    #[test]
    fn test_concepts_with_segments() {
        let t = TranscriptInput {
            text: "API endpoints serve ML predictions. The API handles inference. \
                   GPU acceleration is critical. GPU kernels run fast."
                .to_string(),
            language: "en".to_string(),
            segments: vec![TranscriptSegment {
                start: 0.0,
                end: 10.0,
                text: "API endpoints serve ML predictions.".to_string(),
            }],
            source_path: "test.json".to_string(),
        };
        let reading = generate_key_concepts(&t);
        assert!(!reading.concepts.is_empty());
    }
}