use std::collections::HashMap;
use super::transcript::format_timestamp;
use super::types::{ConceptCategory, TranscriptInput, VocabularyEntry};
const TECH_SUFFIXES: &[&str] =
&["ization", "isation", "ment", "tion", "sion", "ness", "ity", "ence", "ance"];
const KNOWN_ACRONYMS: &[&str] = &[
"API",
"GPU",
"CPU",
"TPU",
"ML",
"AI",
"NLP",
"CNN",
"RNN",
"GAN",
"LLM",
"BERT",
"GPT",
"LSTM",
"GRU",
"RLHF",
"RAG",
"SIMD",
"AVX",
"NEON",
"REST",
"HTTP",
"HTTPS",
"JSON",
"YAML",
"TOML",
"SQL",
"CI",
"CD",
"MLOps",
"DevOps",
"AWS",
"GCP",
"CLI",
"SDK",
"TDD",
"BDD",
"OOP",
"WASM",
"CUDA",
"MCP",
"SSE",
"TLS",
"TCP",
"UDP",
"DNS",
"SSH",
"GGUF",
"LoRA",
"QLoRA",
"GPTQ",
"AWQ",
"KV",
"LZ4",
"ZSTD",
"Docker",
"Kubernetes",
"K8s",
"ECS",
"S3",
"EC2",
"Lambda",
"PyTorch",
"TensorFlow",
"NumPy",
"SciPy",
"Pandas",
"Sklearn",
"HuggingFace",
"SafeTensors",
"Parquet",
"Arrow",
"Kafka",
"NCCL",
"MPI",
"RPC",
"gRPC",
"OAuth",
"JWT",
"RBAC",
];
fn accumulate_transcript(
transcript: &TranscriptInput,
term_data: &mut HashMap<String, TermAccumulator>,
) {
let sentences = split_sentences(&transcript.text);
for (i, sentence) in sentences.iter().enumerate() {
let words = extract_candidate_terms(sentence);
for word in &words {
let normalized = normalize_term(word);
if normalized.len() < 2 || is_stop_word(&normalized) {
continue;
}
let entry = term_data.entry(normalized.clone()).or_insert_with(|| {
let timestamp = find_timestamp_for_sentence(transcript, i, &sentences);
TermAccumulator {
original_form: word.clone(),
first_occurrence: timestamp,
frequency: 0,
contexts: Vec::new(),
source: transcript.source_path.clone(),
}
});
entry.frequency += 1;
if entry.contexts.len() < 3 {
entry.contexts.push(sentence.trim().to_string());
}
}
}
}
pub fn extract_vocabulary(transcripts: &[TranscriptInput]) -> Vec<VocabularyEntry> {
let mut term_data: HashMap<String, TermAccumulator> = HashMap::new();
for transcript in transcripts {
accumulate_transcript(transcript, &mut term_data);
}
let mut entries: Vec<VocabularyEntry> = term_data
.into_iter()
.filter(|(term, acc)| acc.frequency >= 2 || is_known_acronym(term))
.map(|(term, acc)| {
let category = categorize_term(&term);
let definition = derive_definition(&acc.contexts, &term);
VocabularyEntry {
term: acc.original_form,
definition,
first_occurrence: acc.first_occurrence,
frequency: acc.frequency,
category,
}
})
.collect();
entries.sort_by(|a, b| b.frequency.cmp(&a.frequency));
entries
}
pub fn render_vocabulary_markdown(entries: &[VocabularyEntry]) -> String {
let mut md = String::new();
md.push_str("# Course Vocabulary\n\n");
if entries.is_empty() {
md.push_str("No vocabulary terms extracted.\n");
return md;
}
let mut by_category: HashMap<&str, Vec<&VocabularyEntry>> = HashMap::new();
for entry in entries {
by_category.entry(entry.category.as_str()).or_default().push(entry);
}
let mut categories: Vec<&&str> = by_category.keys().collect();
categories.sort();
for cat in categories {
let cat_entries = &by_category[*cat];
md.push_str(&format!("## {}\n\n", cat));
md.push_str("| Term | Definition | Frequency | First Seen |\n");
md.push_str("|------|-----------|-----------|------------|\n");
for entry in cat_entries {
md.push_str(&format!(
"| **{}** | {} | {} | {} |\n",
entry.term, entry.definition, entry.frequency, entry.first_occurrence,
));
}
md.push('\n');
}
md
}
struct TermAccumulator {
original_form: String,
first_occurrence: String,
frequency: usize,
contexts: Vec<String>,
source: String,
}
fn split_sentences(text: &str) -> Vec<String> {
let mut sentences = Vec::new();
let mut current = String::new();
for ch in text.chars() {
current.push(ch);
if matches!(ch, '.' | '!' | '?') {
let trimmed = current.trim().to_string();
if !trimmed.is_empty() {
sentences.push(trimmed);
}
current.clear();
}
}
let trimmed = current.trim().to_string();
if !trimmed.is_empty() {
sentences.push(trimmed);
}
sentences
}
fn extract_candidate_terms(sentence: &str) -> Vec<String> {
let mut terms = Vec::new();
for word in sentence.split_whitespace() {
let cleaned = word.trim_matches(|c: char| !c.is_alphanumeric() && c != '-' && c != '_');
if cleaned.is_empty() {
continue;
}
if is_technical_word(cleaned) {
terms.push(cleaned.to_string());
}
}
for window in sentence.split_whitespace().collect::<Vec<_>>().windows(3) {
if window.len() == 3 && window[1] == "-" {
let compound = format!("{}-{}", window[0], window[2]);
let cleaned = compound.trim_matches(|c: char| !c.is_alphanumeric() && c != '-');
if cleaned.len() > 3 {
terms.push(cleaned.to_string());
}
}
}
terms
}
fn is_technical_word(word: &str) -> bool {
if word.len() >= 2 && word.chars().all(|c| c.is_ascii_uppercase() || c.is_ascii_digit()) {
return true;
}
let has_mid_upper = word.chars().enumerate().any(|(i, c)| {
i > 0 && c.is_uppercase() && word.chars().nth(i - 1).is_some_and(|p| p.is_lowercase())
});
if has_mid_upper {
return true;
}
if word.contains('-') && word.len() > 5 {
return true;
}
if is_known_acronym(word) {
return true;
}
let lower = word.to_lowercase();
if TECH_SUFFIXES.iter().any(|s| lower.ends_with(s)) && word.len() > 6 {
return true;
}
false
}
fn is_known_acronym(word: &str) -> bool {
let lower = word.to_lowercase();
KNOWN_ACRONYMS.iter().any(|a| a.to_lowercase() == lower)
}
fn normalize_term(word: &str) -> String {
if word.chars().all(|c| c.is_ascii_uppercase() || c.is_ascii_digit()) {
word.to_string()
} else {
word.to_lowercase()
}
}
fn is_stop_word(word: &str) -> bool {
const STOP: &[&str] = &[
"the",
"a",
"an",
"is",
"are",
"was",
"were",
"be",
"been",
"being",
"have",
"has",
"had",
"do",
"does",
"did",
"will",
"would",
"could",
"should",
"may",
"might",
"can",
"shall",
"to",
"of",
"in",
"for",
"on",
"with",
"at",
"by",
"from",
"as",
"or",
"and",
"but",
"if",
"not",
"no",
"so",
"up",
"out",
"it",
"its",
"this",
"that",
"these",
"those",
"we",
"you",
"they",
"he",
"she",
"my",
"your",
"our",
"us",
"all",
"each",
"every",
"both",
"few",
"more",
"most",
"other",
"some",
"such",
"than",
"too",
"very",
"just",
"also",
"about",
"which",
"what",
"when",
"where",
"how",
"who",
"whom",
"why",
"into",
"through",
"during",
"before",
"after",
"above",
"below",
"between",
"same",
"different",
"then",
"there",
"here",
"new",
"old",
"many",
"much",
"own",
"only",
"well",
];
STOP.contains(&word.to_lowercase().as_str())
}
fn find_timestamp_for_sentence(
transcript: &TranscriptInput,
sentence_idx: usize,
sentences: &[String],
) -> String {
if transcript.segments.is_empty() {
return format!("sentence {}", sentence_idx + 1);
}
let target_sentence = &sentences[sentence_idx];
for seg in &transcript.segments {
if seg.text.contains(target_sentence.split_whitespace().next().unwrap_or(""))
|| target_sentence.contains(seg.text.split_whitespace().next().unwrap_or(""))
{
return format_timestamp(seg.start);
}
}
if let Some(last_seg) = transcript.segments.last() {
let ratio = sentence_idx as f64 / sentences.len().max(1) as f64;
let estimated_time = ratio * last_seg.end;
return format_timestamp(estimated_time);
}
format!("sentence {}", sentence_idx + 1)
}
fn categorize_term(term: &str) -> ConceptCategory {
let lower = term.to_lowercase();
if KNOWN_ACRONYMS.iter().any(|a| {
let al = a.to_lowercase();
al == lower
&& matches!(
al.as_str(),
"docker"
| "kubernetes"
| "k8s"
| "pytorch"
| "tensorflow"
| "numpy"
| "scipy"
| "pandas"
| "sklearn"
| "kafka"
| "huggingface"
| "mlflow"
)
}) {
return ConceptCategory::Tool;
}
let algo_keywords = [
"sort",
"search",
"gradient",
"descent",
"backprop",
"boosting",
"regression",
"classification",
"clustering",
"optimization",
"attention",
"convolution",
"pooling",
"softmax",
"normalization",
];
if algo_keywords.iter().any(|k| lower.contains(k)) {
return ConceptCategory::Algorithm;
}
let ds_keywords =
["tree", "graph", "array", "tensor", "matrix", "vector", "queue", "stack", "hash", "cache"];
if ds_keywords.iter().any(|k| lower.contains(k)) {
return ConceptCategory::DataStructure;
}
let metric_keywords = [
"accuracy",
"precision",
"recall",
"f1",
"loss",
"score",
"metric",
"perplexity",
"bleu",
"rouge",
"latency",
"throughput",
];
if metric_keywords.iter().any(|k| lower.contains(k)) {
return ConceptCategory::Metric;
}
let pattern_keywords = [
"pattern",
"pipeline",
"workflow",
"architecture",
"design",
"ops",
"devops",
"mlops",
"ci/cd",
"microservice",
];
if pattern_keywords.iter().any(|k| lower.contains(k)) {
return ConceptCategory::Pattern;
}
ConceptCategory::General
}
fn try_extract_definition(ctx: &str, lower_term: &str) -> Option<String> {
let lower_ctx = ctx.to_lowercase();
if let Some(pos) = lower_ctx.find(&format!("{} is ", lower_term)) {
let start = pos + lower_term.len() + 4;
if let Some(def) = ctx.get(start..) {
let end = def.find('.').unwrap_or(def.len()).min(120);
return Some(capitalize_first(safe_truncate_bytes(def, end).trim()));
}
}
if let Some(pos) = lower_ctx.find(&format!("{} refers to ", lower_term)) {
let start = pos + lower_term.len() + 11;
if let Some(def) = ctx.get(start..) {
let end = def.find('.').unwrap_or(def.len()).min(120);
return Some(capitalize_first(safe_truncate_bytes(def, end).trim()));
}
}
None
}
fn derive_definition(contexts: &[String], term: &str) -> String {
let lower_term = term.to_lowercase();
for ctx in contexts {
if let Some(def) = try_extract_definition(ctx, &lower_term) {
return def;
}
}
if let Some(first) = contexts.first() {
return if first.len() > 100 {
format!("{}...", safe_truncate_bytes(first, 100))
} else {
first.clone()
};
}
format!("Technical term: {term}")
}
fn capitalize_first(s: &str) -> String {
let mut chars = s.chars();
match chars.next() {
None => String::new(),
Some(c) => c.to_uppercase().to_string() + chars.as_str(),
}
}
fn safe_truncate_bytes(s: &str, max_bytes: usize) -> &str {
if max_bytes >= s.len() {
return s;
}
let mut end = max_bytes;
while end > 0 && !s.is_char_boundary(end) {
end -= 1;
}
&s[..end]
}
#[cfg(test)]
mod tests {
use super::*;
use crate::oracle::coursera::types::TranscriptSegment;
fn make_transcript(text: &str) -> TranscriptInput {
TranscriptInput {
text: text.to_string(),
language: "en".to_string(),
segments: vec![],
source_path: "test.txt".to_string(),
}
}
#[test]
fn test_extract_vocabulary_basic() {
let t = make_transcript(
"MLOps combines ML and DevOps. MLOps is the practice of deploying ML models. \
DevOps principles apply to ML workflows. API endpoints serve predictions. \
The API handles inference requests.",
);
let entries = extract_vocabulary(&[t]);
assert!(!entries.is_empty());
let mlops = entries.iter().find(|e| e.term.to_lowercase() == "mlops");
assert!(mlops.is_some(), "Should find MLOps");
assert!(mlops.expect("unexpected failure").frequency >= 2);
}
#[test]
fn test_extract_vocabulary_empty() {
let entries = extract_vocabulary(&[]);
assert!(entries.is_empty());
}
#[test]
fn test_extract_vocabulary_no_technical_terms() {
let t = make_transcript("The cat sat on the mat. It was a good day.");
let entries = extract_vocabulary(&[t]);
assert!(entries.is_empty());
}
#[test]
fn test_render_vocabulary_markdown() {
let entries = vec![VocabularyEntry {
term: "MLOps".to_string(),
definition: "Machine Learning Operations".to_string(),
first_occurrence: "0:05".to_string(),
frequency: 5,
category: ConceptCategory::Pattern,
}];
let md = render_vocabulary_markdown(&entries);
assert!(md.contains("# Course Vocabulary"));
assert!(md.contains("MLOps"));
assert!(md.contains("Machine Learning Operations"));
assert!(md.contains("Pattern"));
}
#[test]
fn test_render_vocabulary_markdown_empty() {
let md = render_vocabulary_markdown(&[]);
assert!(md.contains("No vocabulary terms extracted"));
}
#[test]
fn test_is_technical_word() {
assert!(is_technical_word("API"));
assert!(is_technical_word("MLOps"));
assert!(is_technical_word("DevOps"));
assert!(is_technical_word("pre-training"));
assert!(!is_technical_word("the"));
assert!(!is_technical_word("good"));
}
#[test]
fn test_categorize_term() {
assert_eq!(categorize_term("gradient descent"), ConceptCategory::Algorithm);
assert_eq!(categorize_term("tensor"), ConceptCategory::DataStructure);
assert_eq!(categorize_term("accuracy"), ConceptCategory::Metric);
assert_eq!(categorize_term("pipeline"), ConceptCategory::Pattern);
}
#[test]
fn test_split_sentences() {
let sentences = split_sentences("Hello world. How are you? Fine!");
assert_eq!(sentences.len(), 3);
}
#[test]
fn test_vocabulary_with_segments() {
let t = TranscriptInput {
text: "GPU acceleration is important. GPU kernels run SIMD operations. \
GPU computing enables parallel workloads."
.to_string(),
language: "en".to_string(),
segments: vec![
TranscriptSegment {
start: 0.0,
end: 5.0,
text: "GPU acceleration is important.".to_string(),
},
TranscriptSegment {
start: 5.0,
end: 10.0,
text: "GPU kernels run SIMD operations.".to_string(),
},
],
source_path: "lesson.json".to_string(),
};
let entries = extract_vocabulary(&[t]);
let gpu = entries.iter().find(|e| e.term == "GPU");
assert!(gpu.is_some());
}
#[test]
fn test_derive_definition_pattern() {
let contexts =
vec!["MLOps is the practice of deploying ML models in production.".to_string()];
let def = derive_definition(&contexts, "mlops");
assert!(def.contains("practice"), "Got: {def}");
}
#[test]
fn test_normalize_term() {
assert_eq!(normalize_term("API"), "API");
assert_eq!(normalize_term("DevOps"), "devops");
}
#[test]
fn test_derive_definition_refers_to() {
let contexts =
vec!["MLOps refers to the practice of deploying ML models in production.".to_string()];
let def = derive_definition(&contexts, "mlops");
assert!(def.contains("practice") || def.contains("deploying"), "Got: {def}");
}
#[test]
fn test_derive_definition_fallback_long_context() {
let long = format!("{}. More text follows.", "A".repeat(120));
let contexts = vec![long];
let def = derive_definition(&contexts, "nonexistentterm");
assert!(def.contains("..."), "Got: {def}");
}
#[test]
fn test_derive_definition_no_contexts() {
let def = derive_definition(&[], "SomeTerm");
assert_eq!(def, "Technical term: SomeTerm");
}
#[test]
fn test_categorize_term_tool() {
assert_eq!(categorize_term("Docker"), ConceptCategory::Tool);
assert_eq!(categorize_term("Kubernetes"), ConceptCategory::Tool);
assert_eq!(categorize_term("PyTorch"), ConceptCategory::Tool);
}
#[test]
fn test_hyphenated_compound_extraction() {
let terms = extract_candidate_terms("The cross - validation technique is used");
assert!(
terms.iter().any(|t| t.contains("cross") && t.contains("validation")),
"Terms: {:?}",
terms
);
}
#[test]
fn test_find_timestamp_estimated_fallback() {
let t = TranscriptInput {
text: "First sentence. Second sentence. Third sentence.".to_string(),
language: "en".to_string(),
segments: vec![
TranscriptSegment {
start: 0.0,
end: 10.0,
text: "Unrelated segment text here".to_string(),
},
TranscriptSegment {
start: 10.0,
end: 30.0,
text: "Another unrelated segment".to_string(),
},
],
source_path: "test.json".to_string(),
};
let sentences = split_sentences(&t.text);
let ts = find_timestamp_for_sentence(&t, 2, &sentences);
assert!(ts.contains(':'), "Expected timestamp, got: {ts}");
}
#[test]
fn test_capitalize_first_empty() {
assert_eq!(capitalize_first(""), "");
}
#[test]
fn test_safe_truncate_bytes_multibyte() {
let s = "café résumé";
let truncated = safe_truncate_bytes(s, 4);
assert!(!truncated.is_empty());
assert!(s.is_char_boundary(truncated.len()));
}
#[test]
fn test_stop_word_filtering() {
assert!(is_stop_word("the"));
assert!(is_stop_word("and"));
assert!(!is_stop_word("kubernetes"));
}
#[test]
fn test_extract_candidate_terms_empty_word() {
let terms = extract_candidate_terms("... --- *** plain text");
assert!(!terms.iter().any(|t| t == "..." || t == "---" || t == "***"),);
}
#[test]
fn test_derive_definition_short_context() {
let contexts = vec!["A brief context.".to_string()];
let def = derive_definition(&contexts, "xyzterm");
assert_eq!(def, "A brief context.");
}
}