// batuta/oracle/coursera/vocabulary.rs
1//! Vocabulary extraction from transcripts
2//!
3//! Detects technical terms via: uppercase mid-word, acronyms, known suffixes,
4//! and frequency >= 2. Organizes by category with context extraction.
5
6use std::collections::HashMap;
7
8use super::transcript::format_timestamp;
9use super::types::{ConceptCategory, TranscriptInput, VocabularyEntry};
10
/// Suffixes that often mark technical/domain nouns (e.g. "normalization",
/// "deployment"). Matched case-insensitively in `is_technical_word`, and only
/// on words longer than six characters to avoid short false positives.
const TECH_SUFFIXES: &[&str] =
    &["ization", "isation", "ment", "tion", "sion", "ness", "ity", "ence", "ance"];
14
/// Known technical terms, matched case-insensitively.
///
/// Despite the name, this mixes true acronyms (API, GPU, HTTP) with product
/// and library names (Docker, PyTorch, HuggingFace) — membership alone is
/// enough to keep a term even if it appears only once.
const KNOWN_ACRONYMS: &[&str] = &[
    "API",
    "GPU",
    "CPU",
    "TPU",
    "ML",
    "AI",
    "NLP",
    "CNN",
    "RNN",
    "GAN",
    "LLM",
    "BERT",
    "GPT",
    "LSTM",
    "GRU",
    "RLHF",
    "RAG",
    "SIMD",
    "AVX",
    "NEON",
    "REST",
    "HTTP",
    "HTTPS",
    "JSON",
    "YAML",
    "TOML",
    "SQL",
    "CI",
    "CD",
    "MLOps",
    "DevOps",
    "AWS",
    "GCP",
    "CLI",
    "SDK",
    "TDD",
    "BDD",
    "OOP",
    "WASM",
    "CUDA",
    "MCP",
    "SSE",
    "TLS",
    "TCP",
    "UDP",
    "DNS",
    "SSH",
    "GGUF",
    "LoRA",
    "QLoRA",
    "GPTQ",
    "AWQ",
    "KV",
    "LZ4",
    "ZSTD",
    "Docker",
    "Kubernetes",
    "K8s",
    "ECS",
    "S3",
    "EC2",
    "Lambda",
    "PyTorch",
    "TensorFlow",
    "NumPy",
    "SciPy",
    "Pandas",
    "Sklearn",
    "HuggingFace",
    "SafeTensors",
    "Parquet",
    "Arrow",
    "Kafka",
    "NCCL",
    "MPI",
    "RPC",
    "gRPC",
    "OAuth",
    "JWT",
    "RBAC",
];
98
99/// Process a single transcript into the shared term accumulator.
100fn accumulate_transcript(
101    transcript: &TranscriptInput,
102    term_data: &mut HashMap<String, TermAccumulator>,
103) {
104    let sentences = split_sentences(&transcript.text);
105
106    for (i, sentence) in sentences.iter().enumerate() {
107        let words = extract_candidate_terms(sentence);
108
109        for word in &words {
110            let normalized = normalize_term(word);
111            if normalized.len() < 2 || is_stop_word(&normalized) {
112                continue;
113            }
114
115            let entry = term_data.entry(normalized.clone()).or_insert_with(|| {
116                let timestamp = find_timestamp_for_sentence(transcript, i, &sentences);
117                TermAccumulator {
118                    original_form: word.clone(),
119                    first_occurrence: timestamp,
120                    frequency: 0,
121                    contexts: Vec::new(),
122                    source: transcript.source_path.clone(),
123                }
124            });
125
126            entry.frequency += 1;
127            if entry.contexts.len() < 3 {
128                entry.contexts.push(sentence.trim().to_string());
129            }
130        }
131    }
132}
133
134/// Extract vocabulary from multiple transcripts.
135pub fn extract_vocabulary(transcripts: &[TranscriptInput]) -> Vec<VocabularyEntry> {
136    let mut term_data: HashMap<String, TermAccumulator> = HashMap::new();
137
138    for transcript in transcripts {
139        accumulate_transcript(transcript, &mut term_data);
140    }
141
142    let mut entries: Vec<VocabularyEntry> = term_data
143        .into_iter()
144        .filter(|(term, acc)| acc.frequency >= 2 || is_known_acronym(term))
145        .map(|(term, acc)| {
146            let category = categorize_term(&term);
147            let definition = derive_definition(&acc.contexts, &term);
148            VocabularyEntry {
149                term: acc.original_form,
150                definition,
151                first_occurrence: acc.first_occurrence,
152                frequency: acc.frequency,
153                category,
154            }
155        })
156        .collect();
157
158    entries.sort_by(|a, b| b.frequency.cmp(&a.frequency));
159    entries
160}
161
162/// Render vocabulary entries as Markdown.
163pub fn render_vocabulary_markdown(entries: &[VocabularyEntry]) -> String {
164    let mut md = String::new();
165    md.push_str("# Course Vocabulary\n\n");
166
167    if entries.is_empty() {
168        md.push_str("No vocabulary terms extracted.\n");
169        return md;
170    }
171
172    // Group by category
173    let mut by_category: HashMap<&str, Vec<&VocabularyEntry>> = HashMap::new();
174    for entry in entries {
175        by_category.entry(entry.category.as_str()).or_default().push(entry);
176    }
177
178    // Sort categories for deterministic output
179    let mut categories: Vec<&&str> = by_category.keys().collect();
180    categories.sort();
181
182    for cat in categories {
183        let cat_entries = &by_category[*cat];
184        md.push_str(&format!("## {}\n\n", cat));
185        md.push_str("| Term | Definition | Frequency | First Seen |\n");
186        md.push_str("|------|-----------|-----------|------------|\n");
187
188        for entry in cat_entries {
189            md.push_str(&format!(
190                "| **{}** | {} | {} | {} |\n",
191                entry.term, entry.definition, entry.frequency, entry.first_occurrence,
192            ));
193        }
194        md.push('\n');
195    }
196
197    md
198}
199
200// ============================================================================
201// Internal helpers
202// ============================================================================
203
/// Per-term aggregation state built up while scanning transcripts.
struct TermAccumulator {
    // Casing exactly as first seen in a transcript (e.g. "MLOps").
    original_form: String,
    // Timestamp (or "sentence N" label) of the first sighting.
    first_occurrence: String,
    // Total occurrence count across all scanned transcripts.
    frequency: usize,
    // Up to three example sentences, used to derive a definition.
    contexts: Vec<String>,
    // Path of the transcript that first introduced the term.
    // NOTE(review): written in accumulate_transcript but never read in this
    // file — confirm whether it is still needed.
    source: String,
}
211
/// Split `text` into sentences on '.', '!' and '?', keeping the terminator
/// attached to its sentence.
///
/// A trailing fragment with no terminator becomes the final sentence;
/// whitespace-only fragments are dropped and every sentence is trimmed.
fn split_sentences(text: &str) -> Vec<String> {
    // split_inclusive keeps each terminator with its chunk and yields the
    // unterminated tail (if any) as the last chunk.
    text.split_inclusive(|c| matches!(c, '.' | '!' | '?'))
        .map(str::trim)
        .filter(|fragment| !fragment.is_empty())
        .map(str::to_string)
        .collect()
}
235
236fn extract_candidate_terms(sentence: &str) -> Vec<String> {
237    let mut terms = Vec::new();
238
239    for word in sentence.split_whitespace() {
240        let cleaned = word.trim_matches(|c: char| !c.is_alphanumeric() && c != '-' && c != '_');
241
242        if cleaned.is_empty() {
243            continue;
244        }
245
246        // Check if it looks technical
247        if is_technical_word(cleaned) {
248            terms.push(cleaned.to_string());
249        }
250    }
251
252    // Also extract hyphenated compounds
253    for window in sentence.split_whitespace().collect::<Vec<_>>().windows(3) {
254        if window.len() == 3 && window[1] == "-" {
255            let compound = format!("{}-{}", window[0], window[2]);
256            let cleaned = compound.trim_matches(|c: char| !c.is_alphanumeric() && c != '-');
257            if cleaned.len() > 3 {
258                terms.push(cleaned.to_string());
259            }
260        }
261    }
262
263    terms
264}
265
266fn is_technical_word(word: &str) -> bool {
267    // All caps (acronym): ML, API, GPU
268    if word.len() >= 2 && word.chars().all(|c| c.is_ascii_uppercase() || c.is_ascii_digit()) {
269        return true;
270    }
271
272    // CamelCase or mixed case mid-word: MLOps, DevOps, PyTorch
273    let has_mid_upper = word.chars().enumerate().any(|(i, c)| {
274        i > 0 && c.is_uppercase() && word.chars().nth(i - 1).is_some_and(|p| p.is_lowercase())
275    });
276    if has_mid_upper {
277        return true;
278    }
279
280    // Contains hyphen (compound term): cross-validation, pre-training
281    if word.contains('-') && word.len() > 5 {
282        return true;
283    }
284
285    // Known acronym match
286    if is_known_acronym(word) {
287        return true;
288    }
289
290    // Technical suffix
291    let lower = word.to_lowercase();
292    if TECH_SUFFIXES.iter().any(|s| lower.ends_with(s)) && word.len() > 6 {
293        return true;
294    }
295
296    false
297}
298
299fn is_known_acronym(word: &str) -> bool {
300    let lower = word.to_lowercase();
301    KNOWN_ACRONYMS.iter().any(|a| a.to_lowercase() == lower)
302}
303
/// Normalize a term for use as a deduplication key.
///
/// Acronym-like tokens (all ASCII uppercase/digits, e.g. "API", "S3") keep
/// their casing; everything else is lowercased so case variants merge.
fn normalize_term(word: &str) -> String {
    let acronym_like = word.bytes().all(|b| b.is_ascii_uppercase() || b.is_ascii_digit());
    if acronym_like {
        word.to_string()
    } else {
        word.to_lowercase()
    }
}
312
/// Return true when `word` is a common English stop word.
///
/// Comparison is ASCII case-insensitive via `eq_ignore_ascii_case`, which
/// avoids the `to_lowercase()` `String` allocation the previous version paid
/// on every call (all list entries are ASCII).
fn is_stop_word(word: &str) -> bool {
    const STOP: &[&str] = &[
        "the",
        "a",
        "an",
        "is",
        "are",
        "was",
        "were",
        "be",
        "been",
        "being",
        "have",
        "has",
        "had",
        "do",
        "does",
        "did",
        "will",
        "would",
        "could",
        "should",
        "may",
        "might",
        "can",
        "shall",
        "to",
        "of",
        "in",
        "for",
        "on",
        "with",
        "at",
        "by",
        "from",
        "as",
        "or",
        "and",
        "but",
        "if",
        "not",
        "no",
        "so",
        "up",
        "out",
        "it",
        "its",
        "this",
        "that",
        "these",
        "those",
        "we",
        "you",
        "they",
        "he",
        "she",
        "my",
        "your",
        "our",
        "us",
        "all",
        "each",
        "every",
        "both",
        "few",
        "more",
        "most",
        "other",
        "some",
        "such",
        "than",
        "too",
        "very",
        "just",
        "also",
        "about",
        "which",
        "what",
        "when",
        "where",
        "how",
        "who",
        "whom",
        "why",
        "into",
        "through",
        "during",
        "before",
        "after",
        "above",
        "below",
        "between",
        "same",
        "different",
        "then",
        "there",
        "here",
        "new",
        "old",
        "many",
        "much",
        "own",
        "only",
        "well",
    ];
    STOP.iter().any(|s| s.eq_ignore_ascii_case(word))
}
420
421fn find_timestamp_for_sentence(
422    transcript: &TranscriptInput,
423    sentence_idx: usize,
424    sentences: &[String],
425) -> String {
426    if transcript.segments.is_empty() {
427        return format!("sentence {}", sentence_idx + 1);
428    }
429
430    // Approximate: find which segment contains this sentence
431    let target_sentence = &sentences[sentence_idx];
432    for seg in &transcript.segments {
433        if seg.text.contains(target_sentence.split_whitespace().next().unwrap_or(""))
434            || target_sentence.contains(seg.text.split_whitespace().next().unwrap_or(""))
435        {
436            return format_timestamp(seg.start);
437        }
438    }
439
440    // Fallback: estimate based on sentence position
441    if let Some(last_seg) = transcript.segments.last() {
442        let ratio = sentence_idx as f64 / sentences.len().max(1) as f64;
443        let estimated_time = ratio * last_seg.end;
444        return format_timestamp(estimated_time);
445    }
446
447    format!("sentence {}", sentence_idx + 1)
448}
449
450fn categorize_term(term: &str) -> ConceptCategory {
451    let lower = term.to_lowercase();
452
453    // Tool patterns
454    if KNOWN_ACRONYMS.iter().any(|a| {
455        let al = a.to_lowercase();
456        al == lower
457            && matches!(
458                al.as_str(),
459                "docker"
460                    | "kubernetes"
461                    | "k8s"
462                    | "pytorch"
463                    | "tensorflow"
464                    | "numpy"
465                    | "scipy"
466                    | "pandas"
467                    | "sklearn"
468                    | "kafka"
469                    | "huggingface"
470                    | "mlflow"
471            )
472    }) {
473        return ConceptCategory::Tool;
474    }
475
476    // Algorithm patterns
477    let algo_keywords = [
478        "sort",
479        "search",
480        "gradient",
481        "descent",
482        "backprop",
483        "boosting",
484        "regression",
485        "classification",
486        "clustering",
487        "optimization",
488        "attention",
489        "convolution",
490        "pooling",
491        "softmax",
492        "normalization",
493    ];
494    if algo_keywords.iter().any(|k| lower.contains(k)) {
495        return ConceptCategory::Algorithm;
496    }
497
498    // Data structure patterns
499    let ds_keywords =
500        ["tree", "graph", "array", "tensor", "matrix", "vector", "queue", "stack", "hash", "cache"];
501    if ds_keywords.iter().any(|k| lower.contains(k)) {
502        return ConceptCategory::DataStructure;
503    }
504
505    // Metric patterns
506    let metric_keywords = [
507        "accuracy",
508        "precision",
509        "recall",
510        "f1",
511        "loss",
512        "score",
513        "metric",
514        "perplexity",
515        "bleu",
516        "rouge",
517        "latency",
518        "throughput",
519    ];
520    if metric_keywords.iter().any(|k| lower.contains(k)) {
521        return ConceptCategory::Metric;
522    }
523
524    // Pattern keywords
525    let pattern_keywords = [
526        "pattern",
527        "pipeline",
528        "workflow",
529        "architecture",
530        "design",
531        "ops",
532        "devops",
533        "mlops",
534        "ci/cd",
535        "microservice",
536    ];
537    if pattern_keywords.iter().any(|k| lower.contains(k)) {
538        return ConceptCategory::Pattern;
539    }
540
541    ConceptCategory::General
542}
543
544/// Try to extract a definition from a context string using "X is ..." or "X refers to ..." patterns.
545fn try_extract_definition(ctx: &str, lower_term: &str) -> Option<String> {
546    let lower_ctx = ctx.to_lowercase();
547
548    if let Some(pos) = lower_ctx.find(&format!("{} is ", lower_term)) {
549        let start = pos + lower_term.len() + 4;
550        if let Some(def) = ctx.get(start..) {
551            let end = def.find('.').unwrap_or(def.len()).min(120);
552            return Some(capitalize_first(safe_truncate_bytes(def, end).trim()));
553        }
554    }
555
556    if let Some(pos) = lower_ctx.find(&format!("{} refers to ", lower_term)) {
557        let start = pos + lower_term.len() + 11;
558        if let Some(def) = ctx.get(start..) {
559            let end = def.find('.').unwrap_or(def.len()).min(120);
560            return Some(capitalize_first(safe_truncate_bytes(def, end).trim()));
561        }
562    }
563
564    None
565}
566
567fn derive_definition(contexts: &[String], term: &str) -> String {
568    let lower_term = term.to_lowercase();
569
570    for ctx in contexts {
571        if let Some(def) = try_extract_definition(ctx, &lower_term) {
572            return def;
573        }
574    }
575
576    if let Some(first) = contexts.first() {
577        return if first.len() > 100 {
578            format!("{}...", safe_truncate_bytes(first, 100))
579        } else {
580            first.clone()
581        };
582    }
583
584    format!("Technical term: {term}")
585}
586
/// Uppercase the first character of `s`, leaving the rest untouched.
///
/// Handles multi-character uppercase expansions (e.g. 'ß' -> "SS") and
/// returns an empty string for empty input.
fn capitalize_first(s: &str) -> String {
    let mut chars = s.chars();
    let Some(first) = chars.next() else {
        return String::new();
    };
    let mut out = String::with_capacity(s.len());
    out.extend(first.to_uppercase());
    out.push_str(chars.as_str());
    out
}
594
/// Truncate a string at the nearest char boundary at or before `max_bytes`.
///
/// Returns the whole string unchanged when it already fits.
fn safe_truncate_bytes(s: &str, max_bytes: usize) -> &str {
    if s.len() <= max_bytes {
        return s;
    }
    // Walk back from max_bytes to the closest boundary; byte 0 always is one.
    let end = (0..=max_bytes)
        .rev()
        .find(|&i| s.is_char_boundary(i))
        .unwrap_or(0);
    &s[..end]
}
606
#[cfg(test)]
mod tests {
    //! Unit tests for vocabulary extraction, categorization, Markdown
    //! rendering, and the string helpers.

    use super::*;
    use crate::oracle::coursera::types::TranscriptSegment;

    /// Build a minimal, segment-less transcript fixture around `text`.
    fn make_transcript(text: &str) -> TranscriptInput {
        TranscriptInput {
            text: text.to_string(),
            language: "en".to_string(),
            segments: vec![],
            source_path: "test.txt".to_string(),
        }
    }

    #[test]
    fn test_extract_vocabulary_basic() {
        let t = make_transcript(
            "MLOps combines ML and DevOps. MLOps is the practice of deploying ML models. \
             DevOps principles apply to ML workflows. API endpoints serve predictions. \
             The API handles inference requests.",
        );
        let entries = extract_vocabulary(&[t]);
        assert!(!entries.is_empty());

        let mlops = entries.iter().find(|e| e.term.to_lowercase() == "mlops");
        assert!(mlops.is_some(), "Should find MLOps");
        assert!(mlops.expect("unexpected failure").frequency >= 2);
    }

    #[test]
    fn test_extract_vocabulary_empty() {
        let entries = extract_vocabulary(&[]);
        assert!(entries.is_empty());
    }

    #[test]
    fn test_extract_vocabulary_no_technical_terms() {
        let t = make_transcript("The cat sat on the mat. It was a good day.");
        let entries = extract_vocabulary(&[t]);
        assert!(entries.is_empty());
    }

    #[test]
    fn test_render_vocabulary_markdown() {
        let entries = vec![VocabularyEntry {
            term: "MLOps".to_string(),
            definition: "Machine Learning Operations".to_string(),
            first_occurrence: "0:05".to_string(),
            frequency: 5,
            category: ConceptCategory::Pattern,
        }];
        let md = render_vocabulary_markdown(&entries);
        assert!(md.contains("# Course Vocabulary"));
        assert!(md.contains("MLOps"));
        assert!(md.contains("Machine Learning Operations"));
        assert!(md.contains("Pattern"));
    }

    #[test]
    fn test_render_vocabulary_markdown_empty() {
        let md = render_vocabulary_markdown(&[]);
        assert!(md.contains("No vocabulary terms extracted"));
    }

    #[test]
    fn test_is_technical_word() {
        assert!(is_technical_word("API"));
        assert!(is_technical_word("MLOps"));
        assert!(is_technical_word("DevOps"));
        assert!(is_technical_word("pre-training"));
        assert!(!is_technical_word("the"));
        assert!(!is_technical_word("good"));
    }

    #[test]
    fn test_categorize_term() {
        assert_eq!(categorize_term("gradient descent"), ConceptCategory::Algorithm);
        assert_eq!(categorize_term("tensor"), ConceptCategory::DataStructure);
        assert_eq!(categorize_term("accuracy"), ConceptCategory::Metric);
        assert_eq!(categorize_term("pipeline"), ConceptCategory::Pattern);
    }

    #[test]
    fn test_split_sentences() {
        let sentences = split_sentences("Hello world. How are you? Fine!");
        assert_eq!(sentences.len(), 3);
    }

    #[test]
    fn test_vocabulary_with_segments() {
        let t = TranscriptInput {
            text: "GPU acceleration is important. GPU kernels run SIMD operations. \
                   GPU computing enables parallel workloads."
                .to_string(),
            language: "en".to_string(),
            segments: vec![
                TranscriptSegment {
                    start: 0.0,
                    end: 5.0,
                    text: "GPU acceleration is important.".to_string(),
                },
                TranscriptSegment {
                    start: 5.0,
                    end: 10.0,
                    text: "GPU kernels run SIMD operations.".to_string(),
                },
            ],
            source_path: "lesson.json".to_string(),
        };
        let entries = extract_vocabulary(&[t]);
        let gpu = entries.iter().find(|e| e.term == "GPU");
        assert!(gpu.is_some());
    }

    #[test]
    fn test_derive_definition_pattern() {
        // Exercises the "X is ..." extraction pattern.
        let contexts =
            vec!["MLOps is the practice of deploying ML models in production.".to_string()];
        let def = derive_definition(&contexts, "mlops");
        assert!(def.contains("practice"), "Got: {def}");
    }

    #[test]
    fn test_normalize_term() {
        assert_eq!(normalize_term("API"), "API");
        assert_eq!(normalize_term("DevOps"), "devops");
    }

    #[test]
    fn test_derive_definition_refers_to() {
        // Exercises the "X refers to ..." extraction pattern.
        let contexts =
            vec!["MLOps refers to the practice of deploying ML models in production.".to_string()];
        let def = derive_definition(&contexts, "mlops");
        assert!(def.contains("practice") || def.contains("deploying"), "Got: {def}");
    }

    #[test]
    fn test_derive_definition_fallback_long_context() {
        // Exercises truncation of an over-long first context (> 100 bytes).
        let long = format!("{}. More text follows.", "A".repeat(120));
        let contexts = vec![long];
        let def = derive_definition(&contexts, "nonexistentterm");
        assert!(def.contains("..."), "Got: {def}");
    }

    #[test]
    fn test_derive_definition_no_contexts() {
        // Exercises the empty-contexts placeholder fallback.
        let def = derive_definition(&[], "SomeTerm");
        assert_eq!(def, "Technical term: SomeTerm");
    }

    #[test]
    fn test_categorize_term_tool() {
        // Exercises the Tool name match in categorize_term.
        assert_eq!(categorize_term("Docker"), ConceptCategory::Tool);
        assert_eq!(categorize_term("Kubernetes"), ConceptCategory::Tool);
        assert_eq!(categorize_term("PyTorch"), ConceptCategory::Tool);
    }

    #[test]
    fn test_hyphenated_compound_extraction() {
        // Exercises rejoining of "word - word" triples into compounds.
        let terms = extract_candidate_terms("The cross - validation technique is used");
        assert!(
            terms.iter().any(|t| t.contains("cross") && t.contains("validation")),
            "Terms: {:?}",
            terms
        );
    }

    #[test]
    fn test_find_timestamp_estimated_fallback() {
        // Exercises the position-based timestamp estimate when no segment
        // matches the sentence.
        let t = TranscriptInput {
            text: "First sentence. Second sentence. Third sentence.".to_string(),
            language: "en".to_string(),
            segments: vec![
                TranscriptSegment {
                    start: 0.0,
                    end: 10.0,
                    text: "Unrelated segment text here".to_string(),
                },
                TranscriptSegment {
                    start: 10.0,
                    end: 30.0,
                    text: "Another unrelated segment".to_string(),
                },
            ],
            source_path: "test.json".to_string(),
        };
        let sentences = split_sentences(&t.text);
        // Third sentence won't match any segment, triggering estimated fallback
        let ts = find_timestamp_for_sentence(&t, 2, &sentences);
        // Should produce a time-based estimate, not "sentence 3"
        assert!(ts.contains(':'), "Expected timestamp, got: {ts}");
    }

    #[test]
    fn test_capitalize_first_empty() {
        assert_eq!(capitalize_first(""), "");
    }

    #[test]
    fn test_safe_truncate_bytes_multibyte() {
        // Exercises the walk-back-to-char-boundary adjustment.
        let s = "café résumé";
        let truncated = safe_truncate_bytes(s, 4);
        // 'é' is 2 bytes, position 4 should be safe but let's test the boundary
        assert!(!truncated.is_empty());
        assert!(s.is_char_boundary(truncated.len()));
    }

    #[test]
    fn test_stop_word_filtering() {
        // Exercises the stop-word skip in accumulation.
        assert!(is_stop_word("the"));
        assert!(is_stop_word("and"));
        assert!(!is_stop_word("kubernetes"));
    }

    #[test]
    fn test_extract_candidate_terms_empty_word() {
        // Exercises the skip of punctuation-only tokens.
        let terms = extract_candidate_terms("... --- *** plain text");
        // None of these should produce technical terms
        assert!(!terms.iter().any(|t| t == "..." || t == "---" || t == "***"),);
    }

    #[test]
    fn test_derive_definition_short_context() {
        // Exercises the untruncated fallback for a short (<= 100 byte) context.
        let contexts = vec!["A brief context.".to_string()];
        let def = derive_definition(&contexts, "xyzterm");
        assert_eq!(def, "A brief context.");
    }
}