1use std::collections::HashMap;
7
8use super::transcript::format_timestamp;
9use super::types::{ConceptCategory, TranscriptInput, VocabularyEntry};
10
/// Derivational suffixes that often mark abstract/technical nouns
/// (e.g. "optimization", "classification"). Used by `is_technical_word`
/// as a weak signal, and only for words longer than 6 characters.
const TECH_SUFFIXES: &[&str] =
    &["ization", "isation", "ment", "tion", "sion", "ness", "ity", "ence", "ance"];
14
/// Allow-list of terms treated as technical regardless of frequency.
/// Despite the name, this mixes true acronyms ("API", "GPU") with
/// product/framework names ("Docker", "PyTorch") — all are matched
/// case-insensitively via `is_known_acronym`, so the casing here is
/// only the preferred display form.
const KNOWN_ACRONYMS: &[&str] = &[
    "API",
    "GPU",
    "CPU",
    "TPU",
    "ML",
    "AI",
    "NLP",
    "CNN",
    "RNN",
    "GAN",
    "LLM",
    "BERT",
    "GPT",
    "LSTM",
    "GRU",
    "RLHF",
    "RAG",
    "SIMD",
    "AVX",
    "NEON",
    "REST",
    "HTTP",
    "HTTPS",
    "JSON",
    "YAML",
    "TOML",
    "SQL",
    "CI",
    "CD",
    "MLOps",
    "DevOps",
    "AWS",
    "GCP",
    "CLI",
    "SDK",
    "TDD",
    "BDD",
    "OOP",
    "WASM",
    "CUDA",
    "MCP",
    "SSE",
    "TLS",
    "TCP",
    "UDP",
    "DNS",
    "SSH",
    "GGUF",
    "LoRA",
    "QLoRA",
    "GPTQ",
    "AWQ",
    "KV",
    "LZ4",
    "ZSTD",
    "Docker",
    "Kubernetes",
    "K8s",
    "ECS",
    "S3",
    "EC2",
    "Lambda",
    "PyTorch",
    "TensorFlow",
    "NumPy",
    "SciPy",
    "Pandas",
    "Sklearn",
    "HuggingFace",
    "SafeTensors",
    "Parquet",
    "Arrow",
    "Kafka",
    "NCCL",
    "MPI",
    "RPC",
    "gRPC",
    "OAuth",
    "JWT",
    "RBAC",
];
98
99fn accumulate_transcript(
101 transcript: &TranscriptInput,
102 term_data: &mut HashMap<String, TermAccumulator>,
103) {
104 let sentences = split_sentences(&transcript.text);
105
106 for (i, sentence) in sentences.iter().enumerate() {
107 let words = extract_candidate_terms(sentence);
108
109 for word in &words {
110 let normalized = normalize_term(word);
111 if normalized.len() < 2 || is_stop_word(&normalized) {
112 continue;
113 }
114
115 let entry = term_data.entry(normalized.clone()).or_insert_with(|| {
116 let timestamp = find_timestamp_for_sentence(transcript, i, &sentences);
117 TermAccumulator {
118 original_form: word.clone(),
119 first_occurrence: timestamp,
120 frequency: 0,
121 contexts: Vec::new(),
122 source: transcript.source_path.clone(),
123 }
124 });
125
126 entry.frequency += 1;
127 if entry.contexts.len() < 3 {
128 entry.contexts.push(sentence.trim().to_string());
129 }
130 }
131 }
132}
133
134pub fn extract_vocabulary(transcripts: &[TranscriptInput]) -> Vec<VocabularyEntry> {
136 let mut term_data: HashMap<String, TermAccumulator> = HashMap::new();
137
138 for transcript in transcripts {
139 accumulate_transcript(transcript, &mut term_data);
140 }
141
142 let mut entries: Vec<VocabularyEntry> = term_data
143 .into_iter()
144 .filter(|(term, acc)| acc.frequency >= 2 || is_known_acronym(term))
145 .map(|(term, acc)| {
146 let category = categorize_term(&term);
147 let definition = derive_definition(&acc.contexts, &term);
148 VocabularyEntry {
149 term: acc.original_form,
150 definition,
151 first_occurrence: acc.first_occurrence,
152 frequency: acc.frequency,
153 category,
154 }
155 })
156 .collect();
157
158 entries.sort_by(|a, b| b.frequency.cmp(&a.frequency));
159 entries
160}
161
162pub fn render_vocabulary_markdown(entries: &[VocabularyEntry]) -> String {
164 let mut md = String::new();
165 md.push_str("# Course Vocabulary\n\n");
166
167 if entries.is_empty() {
168 md.push_str("No vocabulary terms extracted.\n");
169 return md;
170 }
171
172 let mut by_category: HashMap<&str, Vec<&VocabularyEntry>> = HashMap::new();
174 for entry in entries {
175 by_category.entry(entry.category.as_str()).or_default().push(entry);
176 }
177
178 let mut categories: Vec<&&str> = by_category.keys().collect();
180 categories.sort();
181
182 for cat in categories {
183 let cat_entries = &by_category[*cat];
184 md.push_str(&format!("## {}\n\n", cat));
185 md.push_str("| Term | Definition | Frequency | First Seen |\n");
186 md.push_str("|------|-----------|-----------|------------|\n");
187
188 for entry in cat_entries {
189 md.push_str(&format!(
190 "| **{}** | {} | {} | {} |\n",
191 entry.term, entry.definition, entry.frequency, entry.first_occurrence,
192 ));
193 }
194 md.push('\n');
195 }
196
197 md
198}
199
/// Intermediate per-term state collected while scanning transcripts,
/// later converted into a `VocabularyEntry`.
struct TermAccumulator {
    /// Surface form as first seen in the text (pre-normalization casing).
    original_form: String,
    /// Timestamp (or "sentence N" fallback) of the first occurrence.
    first_occurrence: String,
    /// Total occurrence count across all transcripts.
    frequency: usize,
    /// Up to three example sentences containing the term.
    contexts: Vec<String>,
    /// Source path of the transcript the term was first seen in.
    // NOTE(review): nothing in this module reads `source` after it is
    // written — confirm it is needed or remove it.
    source: String,
}
211
/// Splits raw text into sentences, breaking after '.', '!' or '?'.
///
/// Every returned sentence is trimmed and non-empty, and keeps its
/// terminating punctuation. Trailing text without a terminator becomes
/// a final sentence of its own.
fn split_sentences(text: &str) -> Vec<String> {
    // `split_inclusive` keeps the matched terminator at the end of each
    // piece, which is exactly the original accumulate-then-flush shape.
    text.split_inclusive(|c: char| matches!(c, '.' | '!' | '?'))
        .map(str::trim)
        .filter(|piece| !piece.is_empty())
        .map(str::to_string)
        .collect()
}
235
236fn extract_candidate_terms(sentence: &str) -> Vec<String> {
237 let mut terms = Vec::new();
238
239 for word in sentence.split_whitespace() {
240 let cleaned = word.trim_matches(|c: char| !c.is_alphanumeric() && c != '-' && c != '_');
241
242 if cleaned.is_empty() {
243 continue;
244 }
245
246 if is_technical_word(cleaned) {
248 terms.push(cleaned.to_string());
249 }
250 }
251
252 for window in sentence.split_whitespace().collect::<Vec<_>>().windows(3) {
254 if window.len() == 3 && window[1] == "-" {
255 let compound = format!("{}-{}", window[0], window[2]);
256 let cleaned = compound.trim_matches(|c: char| !c.is_alphanumeric() && c != '-');
257 if cleaned.len() > 3 {
258 terms.push(cleaned.to_string());
259 }
260 }
261 }
262
263 terms
264}
265
266fn is_technical_word(word: &str) -> bool {
267 if word.len() >= 2 && word.chars().all(|c| c.is_ascii_uppercase() || c.is_ascii_digit()) {
269 return true;
270 }
271
272 let has_mid_upper = word.chars().enumerate().any(|(i, c)| {
274 i > 0 && c.is_uppercase() && word.chars().nth(i - 1).is_some_and(|p| p.is_lowercase())
275 });
276 if has_mid_upper {
277 return true;
278 }
279
280 if word.contains('-') && word.len() > 5 {
282 return true;
283 }
284
285 if is_known_acronym(word) {
287 return true;
288 }
289
290 let lower = word.to_lowercase();
292 if TECH_SUFFIXES.iter().any(|s| lower.ends_with(s)) && word.len() > 6 {
293 return true;
294 }
295
296 false
297}
298
299fn is_known_acronym(word: &str) -> bool {
300 let lower = word.to_lowercase();
301 KNOWN_ACRONYMS.iter().any(|a| a.to_lowercase() == lower)
302}
303
/// Canonical map key for a term: all-caps/digit tokens (acronyms like
/// "API", "S3") keep their casing; everything else is lowercased so
/// "DevOps" and "devops" collapse into a single entry.
fn normalize_term(word: &str) -> String {
    let keep_case = word.chars().all(|c| c.is_ascii_uppercase() || c.is_ascii_digit());
    if keep_case { word.to_string() } else { word.to_lowercase() }
}
312
/// Returns true for common English function words that should never
/// become vocabulary entries. Matching is case-insensitive; the list is
/// all-ASCII, so `eq_ignore_ascii_case` reproduces the previous
/// lowercase-and-compare behavior without allocating a `String` on
/// every call.
fn is_stop_word(word: &str) -> bool {
    const STOP: &[&str] = &[
        "the", "a", "an", "is", "are", "was", "were", "be", "been", "being",
        "have", "has", "had", "do", "does", "did", "will", "would", "could", "should",
        "may", "might", "can", "shall", "to", "of", "in", "for", "on", "with",
        "at", "by", "from", "as", "or", "and", "but", "if", "not", "no",
        "so", "up", "out", "it", "its", "this", "that", "these", "those", "we",
        "you", "they", "he", "she", "my", "your", "our", "us", "all", "each",
        "every", "both", "few", "more", "most", "other", "some", "such", "than", "too",
        "very", "just", "also", "about", "which", "what", "when", "where", "how", "who",
        "whom", "why", "into", "through", "during", "before", "after", "above", "below", "between",
        "same", "different", "then", "there", "here", "new", "old", "many", "much", "own",
        "only", "well",
    ];
    STOP.iter().any(|s| s.eq_ignore_ascii_case(word))
}
420
/// Best-effort timestamp label for the sentence at `sentence_idx`.
///
/// Strategy, in order:
/// 1. No segments at all: fall back to a "sentence N" label.
/// 2. First-word containment match against each segment, in segment
///    order — a weak heuristic that can pick the wrong segment when
///    several sentences share an opening word.
/// 3. Linear interpolation: the sentence's relative position in the
///    text, scaled by the end time of the last segment.
///
/// Panics if `sentence_idx` is out of bounds for `sentences`; callers
/// derive both from the same `split_sentences` output.
fn find_timestamp_for_sentence(
    transcript: &TranscriptInput,
    sentence_idx: usize,
    sentences: &[String],
) -> String {
    // No timing data available — nothing to anchor to.
    if transcript.segments.is_empty() {
        return format!("sentence {}", sentence_idx + 1);
    }

    // Heuristic: segment text contains the sentence's first word, or
    // the sentence contains the segment's first word. First hit wins.
    let target_sentence = &sentences[sentence_idx];
    for seg in &transcript.segments {
        if seg.text.contains(target_sentence.split_whitespace().next().unwrap_or(""))
            || target_sentence.contains(seg.text.split_whitespace().next().unwrap_or(""))
        {
            return format_timestamp(seg.start);
        }
    }

    // No textual match: estimate by position within the transcript.
    if let Some(last_seg) = transcript.segments.last() {
        let ratio = sentence_idx as f64 / sentences.len().max(1) as f64;
        let estimated_time = ratio * last_seg.end;
        return format_timestamp(estimated_time);
    }

    // Unreachable in practice (segments is non-empty past the early
    // return above); kept as a safe default.
    format!("sentence {}", sentence_idx + 1)
}
449
450fn categorize_term(term: &str) -> ConceptCategory {
451 let lower = term.to_lowercase();
452
453 if KNOWN_ACRONYMS.iter().any(|a| {
455 let al = a.to_lowercase();
456 al == lower
457 && matches!(
458 al.as_str(),
459 "docker"
460 | "kubernetes"
461 | "k8s"
462 | "pytorch"
463 | "tensorflow"
464 | "numpy"
465 | "scipy"
466 | "pandas"
467 | "sklearn"
468 | "kafka"
469 | "huggingface"
470 | "mlflow"
471 )
472 }) {
473 return ConceptCategory::Tool;
474 }
475
476 let algo_keywords = [
478 "sort",
479 "search",
480 "gradient",
481 "descent",
482 "backprop",
483 "boosting",
484 "regression",
485 "classification",
486 "clustering",
487 "optimization",
488 "attention",
489 "convolution",
490 "pooling",
491 "softmax",
492 "normalization",
493 ];
494 if algo_keywords.iter().any(|k| lower.contains(k)) {
495 return ConceptCategory::Algorithm;
496 }
497
498 let ds_keywords =
500 ["tree", "graph", "array", "tensor", "matrix", "vector", "queue", "stack", "hash", "cache"];
501 if ds_keywords.iter().any(|k| lower.contains(k)) {
502 return ConceptCategory::DataStructure;
503 }
504
505 let metric_keywords = [
507 "accuracy",
508 "precision",
509 "recall",
510 "f1",
511 "loss",
512 "score",
513 "metric",
514 "perplexity",
515 "bleu",
516 "rouge",
517 "latency",
518 "throughput",
519 ];
520 if metric_keywords.iter().any(|k| lower.contains(k)) {
521 return ConceptCategory::Metric;
522 }
523
524 let pattern_keywords = [
526 "pattern",
527 "pipeline",
528 "workflow",
529 "architecture",
530 "design",
531 "ops",
532 "devops",
533 "mlops",
534 "ci/cd",
535 "microservice",
536 ];
537 if pattern_keywords.iter().any(|k| lower.contains(k)) {
538 return ConceptCategory::Pattern;
539 }
540
541 ConceptCategory::General
542}
543
544fn try_extract_definition(ctx: &str, lower_term: &str) -> Option<String> {
546 let lower_ctx = ctx.to_lowercase();
547
548 if let Some(pos) = lower_ctx.find(&format!("{} is ", lower_term)) {
549 let start = pos + lower_term.len() + 4;
550 if let Some(def) = ctx.get(start..) {
551 let end = def.find('.').unwrap_or(def.len()).min(120);
552 return Some(capitalize_first(safe_truncate_bytes(def, end).trim()));
553 }
554 }
555
556 if let Some(pos) = lower_ctx.find(&format!("{} refers to ", lower_term)) {
557 let start = pos + lower_term.len() + 11;
558 if let Some(def) = ctx.get(start..) {
559 let end = def.find('.').unwrap_or(def.len()).min(120);
560 return Some(capitalize_first(safe_truncate_bytes(def, end).trim()));
561 }
562 }
563
564 None
565}
566
567fn derive_definition(contexts: &[String], term: &str) -> String {
568 let lower_term = term.to_lowercase();
569
570 for ctx in contexts {
571 if let Some(def) = try_extract_definition(ctx, &lower_term) {
572 return def;
573 }
574 }
575
576 if let Some(first) = contexts.first() {
577 return if first.len() > 100 {
578 format!("{}...", safe_truncate_bytes(first, 100))
579 } else {
580 first.clone()
581 };
582 }
583
584 format!("Technical term: {term}")
585}
586
/// Uppercases the first character of `s`, leaving the rest untouched;
/// the empty string is returned unchanged. A char whose uppercase form
/// is multi-char (e.g. 'ß' -> "SS") expands accordingly.
fn capitalize_first(s: &str) -> String {
    let mut it = s.chars();
    it.next()
        .map_or_else(String::new, |first| format!("{}{}", first.to_uppercase(), it.as_str()))
}
594
/// Returns the longest prefix of `s` that is at most `max_bytes` bytes
/// long and ends on a UTF-8 char boundary, so the slice can never
/// panic or split a multi-byte character.
fn safe_truncate_bytes(s: &str, max_bytes: usize) -> &str {
    if s.len() <= max_bytes {
        return s;
    }
    let cut = (0..=max_bytes)
        .rev()
        .find(|&i| s.is_char_boundary(i))
        .expect("index 0 is always a char boundary");
    &s[..cut]
}
606
// Unit tests: extraction pipeline end-to-end plus the string helpers.
#[cfg(test)]
mod tests {
    use super::*;
    use crate::oracle::coursera::types::TranscriptSegment;

    // Helper: builds a segment-free transcript around the given text.
    fn make_transcript(text: &str) -> TranscriptInput {
        TranscriptInput {
            text: text.to_string(),
            language: "en".to_string(),
            segments: vec![],
            source_path: "test.txt".to_string(),
        }
    }

    #[test]
    fn test_extract_vocabulary_basic() {
        let t = make_transcript(
            "MLOps combines ML and DevOps. MLOps is the practice of deploying ML models. \
             DevOps principles apply to ML workflows. API endpoints serve predictions. \
             The API handles inference requests.",
        );
        let entries = extract_vocabulary(&[t]);
        assert!(!entries.is_empty());

        // "MLOps" occurs twice, so it must survive the frequency filter.
        let mlops = entries.iter().find(|e| e.term.to_lowercase() == "mlops");
        assert!(mlops.is_some(), "Should find MLOps");
        assert!(mlops.expect("unexpected failure").frequency >= 2);
    }

    #[test]
    fn test_extract_vocabulary_empty() {
        let entries = extract_vocabulary(&[]);
        assert!(entries.is_empty());
    }

    #[test]
    fn test_extract_vocabulary_no_technical_terms() {
        // Plain prose: nothing should pass the technical-word heuristics.
        let t = make_transcript("The cat sat on the mat. It was a good day.");
        let entries = extract_vocabulary(&[t]);
        assert!(entries.is_empty());
    }

    #[test]
    fn test_render_vocabulary_markdown() {
        let entries = vec![VocabularyEntry {
            term: "MLOps".to_string(),
            definition: "Machine Learning Operations".to_string(),
            first_occurrence: "0:05".to_string(),
            frequency: 5,
            category: ConceptCategory::Pattern,
        }];
        let md = render_vocabulary_markdown(&entries);
        assert!(md.contains("# Course Vocabulary"));
        assert!(md.contains("MLOps"));
        assert!(md.contains("Machine Learning Operations"));
        assert!(md.contains("Pattern"));
    }

    #[test]
    fn test_render_vocabulary_markdown_empty() {
        let md = render_vocabulary_markdown(&[]);
        assert!(md.contains("No vocabulary terms extracted"));
    }

    #[test]
    fn test_is_technical_word() {
        assert!(is_technical_word("API"));
        assert!(is_technical_word("MLOps"));
        assert!(is_technical_word("DevOps"));
        assert!(is_technical_word("pre-training"));
        assert!(!is_technical_word("the"));
        assert!(!is_technical_word("good"));
    }

    #[test]
    fn test_categorize_term() {
        assert_eq!(categorize_term("gradient descent"), ConceptCategory::Algorithm);
        assert_eq!(categorize_term("tensor"), ConceptCategory::DataStructure);
        assert_eq!(categorize_term("accuracy"), ConceptCategory::Metric);
        assert_eq!(categorize_term("pipeline"), ConceptCategory::Pattern);
    }

    #[test]
    fn test_split_sentences() {
        let sentences = split_sentences("Hello world. How are you? Fine!");
        assert_eq!(sentences.len(), 3);
    }

    #[test]
    fn test_vocabulary_with_segments() {
        // Transcript with timing segments; "GPU" is all-caps so it must
        // be extracted.
        let t = TranscriptInput {
            text: "GPU acceleration is important. GPU kernels run SIMD operations. \
                   GPU computing enables parallel workloads."
                .to_string(),
            language: "en".to_string(),
            segments: vec![
                TranscriptSegment {
                    start: 0.0,
                    end: 5.0,
                    text: "GPU acceleration is important.".to_string(),
                },
                TranscriptSegment {
                    start: 5.0,
                    end: 10.0,
                    text: "GPU kernels run SIMD operations.".to_string(),
                },
            ],
            source_path: "lesson.json".to_string(),
        };
        let entries = extract_vocabulary(&[t]);
        let gpu = entries.iter().find(|e| e.term == "GPU");
        assert!(gpu.is_some());
    }

    #[test]
    fn test_derive_definition_pattern() {
        let contexts =
            vec!["MLOps is the practice of deploying ML models in production.".to_string()];
        let def = derive_definition(&contexts, "mlops");
        assert!(def.contains("practice"), "Got: {def}");
    }

    #[test]
    fn test_normalize_term() {
        assert_eq!(normalize_term("API"), "API");
        assert_eq!(normalize_term("DevOps"), "devops");
    }

    #[test]
    fn test_derive_definition_refers_to() {
        let contexts =
            vec!["MLOps refers to the practice of deploying ML models in production.".to_string()];
        let def = derive_definition(&contexts, "mlops");
        assert!(def.contains("practice") || def.contains("deploying"), "Got: {def}");
    }

    #[test]
    fn test_derive_definition_fallback_long_context() {
        // Context longer than 100 bytes: expect the truncated-with-ellipsis
        // fallback.
        let long = format!("{}. More text follows.", "A".repeat(120));
        let contexts = vec![long];
        let def = derive_definition(&contexts, "nonexistentterm");
        assert!(def.contains("..."), "Got: {def}");
    }

    #[test]
    fn test_derive_definition_no_contexts() {
        let def = derive_definition(&[], "SomeTerm");
        assert_eq!(def, "Technical term: SomeTerm");
    }

    #[test]
    fn test_categorize_term_tool() {
        assert_eq!(categorize_term("Docker"), ConceptCategory::Tool);
        assert_eq!(categorize_term("Kubernetes"), ConceptCategory::Tool);
        assert_eq!(categorize_term("PyTorch"), ConceptCategory::Tool);
    }

    #[test]
    fn test_hyphenated_compound_extraction() {
        // A free-standing hyphen should be rejoined into one compound term.
        let terms = extract_candidate_terms("The cross - validation technique is used");
        assert!(
            terms.iter().any(|t| t.contains("cross") && t.contains("validation")),
            "Terms: {:?}",
            terms
        );
    }

    #[test]
    fn test_find_timestamp_estimated_fallback() {
        // No segment shares a first word with the target sentence, so the
        // position-based time estimate must kick in.
        let t = TranscriptInput {
            text: "First sentence. Second sentence. Third sentence.".to_string(),
            language: "en".to_string(),
            segments: vec![
                TranscriptSegment {
                    start: 0.0,
                    end: 10.0,
                    text: "Unrelated segment text here".to_string(),
                },
                TranscriptSegment {
                    start: 10.0,
                    end: 30.0,
                    text: "Another unrelated segment".to_string(),
                },
            ],
            source_path: "test.json".to_string(),
        };
        let sentences = split_sentences(&t.text);
        let ts = find_timestamp_for_sentence(&t, 2, &sentences);
        assert!(ts.contains(':'), "Expected timestamp, got: {ts}");
    }

    #[test]
    fn test_capitalize_first_empty() {
        assert_eq!(capitalize_first(""), "");
    }

    #[test]
    fn test_safe_truncate_bytes_multibyte() {
        // Truncation must never split a multi-byte character.
        let s = "café résumé";
        let truncated = safe_truncate_bytes(s, 4);
        assert!(!truncated.is_empty());
        assert!(s.is_char_boundary(truncated.len()));
    }

    #[test]
    fn test_stop_word_filtering() {
        assert!(is_stop_word("the"));
        assert!(is_stop_word("and"));
        assert!(!is_stop_word("kubernetes"));
    }

    #[test]
    fn test_extract_candidate_terms_empty_word() {
        // Pure-punctuation tokens trim down to nothing and are dropped.
        let terms = extract_candidate_terms("... --- *** plain text");
        assert!(!terms.iter().any(|t| t == "..." || t == "---" || t == "***"),);
    }

    #[test]
    fn test_derive_definition_short_context() {
        let contexts = vec!["A brief context.".to_string()];
        let def = derive_definition(&contexts, "xyzterm");
        assert_eq!(def, "A brief context.");
    }
}