use std::collections::HashMap;
use super::types::{CodeExample, Concept, ConceptCategory, KeyConceptsReading, TranscriptInput};
use super::vocabulary;
/// Builds the key-concepts reading for a single transcript.
///
/// Concepts are extracted first; the code examples are then selected with
/// those concepts available, so each example can be labelled with a related
/// concept term.
pub fn generate_key_concepts(transcript: &TranscriptInput) -> KeyConceptsReading {
    let concepts = extract_concepts(transcript);
    // Borrow the concepts for example selection before moving them into the result.
    let code_examples = extract_code_examples(transcript, concepts.as_slice());
    KeyConceptsReading {
        concepts,
        code_examples,
    }
}
/// Renders a [`KeyConceptsReading`] as a Markdown document.
///
/// Layout of the output:
/// - a `# Key Concepts` title;
/// - one `## <category>` section per concept category, ordered by the
///   category's string form, each holding a two-column term/definition table
///   followed by a block quote for every concept with a non-empty context;
/// - a final `## Code Examples` section with one fenced block per example,
///   emitted only when examples exist.
///
/// When the reading has no concepts, only the title and a short notice are
/// returned; code examples are not rendered in that case.
pub fn render_key_concepts_markdown(reading: &KeyConceptsReading) -> String {
    let mut out = String::from("# Key Concepts\n\n");

    if reading.concepts.is_empty() {
        out.push_str("No key concepts extracted from this transcript.\n");
        return out;
    }

    // Bucket concepts per category label; each bucket keeps the input order.
    let mut grouped: HashMap<&str, Vec<&Concept>> = HashMap::new();
    for item in reading.concepts.iter() {
        grouped.entry(item.category.as_str()).or_default().push(item);
    }

    // Hash-map iteration order is unspecified, so sort the sections by label
    // to keep the rendered document deterministic.
    let mut sections: Vec<(&str, Vec<&Concept>)> = grouped.into_iter().collect();
    sections.sort_by(|left, right| left.0.cmp(right.0));

    for (label, members) in &sections {
        out.push_str(&format!("## {}\n\n", label));
        out.push_str("| Concept | Definition |\n");
        out.push_str("|---------|------------|\n");
        for member in members {
            out.push_str(&format!("| **{}** | {} |\n", member.term, member.definition));
        }
        out.push('\n');

        // Quotes follow the table, one per concept that has supporting context.
        for member in members {
            if member.context.is_empty() {
                continue;
            }
            out.push_str(&format!("> *\"{}\"*\n\n", member.context));
        }
    }

    if reading.code_examples.is_empty() {
        return out;
    }

    out.push_str("## Code Examples\n\n");
    for snippet in &reading.code_examples {
        out.push_str(&format!(
            "### {} ({})\n\n```{}\n{}\n```\n\n",
            snippet.related_concept, snippet.language, snippet.language, snippet.code
        ));
    }
    out
}
/// Collects up to 15 concepts from a transcript.
///
/// Two sources are combined, in this order:
/// 1. every entry returned by `vocabulary::extract_vocabulary`, paired with a
///    context sentence and, when the entry's own definition is 10 bytes or
///    shorter, a definition derived from the transcript sentences;
/// 2. sentences that match a definition pattern ("X is a ...", "X refers to ...").
///
/// Terms are de-duplicated case-insensitively; the first occurrence wins.
fn extract_concepts(transcript: &TranscriptInput) -> Vec<Concept> {
    let vocab = vocabulary::extract_vocabulary(std::slice::from_ref(transcript));
    let sentences = split_sentences(&transcript.text);

    let mut seen_terms = std::collections::HashSet::<String>::new();
    let mut found: Vec<Concept> = Vec::new();

    for entry in &vocab {
        // `insert` returns false when the lowercased term was already recorded.
        if !seen_terms.insert(entry.term.to_lowercase()) {
            continue;
        }
        let context = find_best_context(&sentences, &entry.term);
        let definition = if entry.definition.len() > 10 {
            entry.definition.clone()
        } else {
            derive_concept_definition(&sentences, &entry.term)
        };
        found.push(Concept {
            term: entry.term.clone(),
            definition,
            context,
            category: entry.category,
        });
    }

    // Pattern-based concepts only fill in terms the vocabulary did not cover.
    let pattern_concepts = sentences
        .iter()
        .filter_map(|sentence| try_extract_definition_pattern(sentence));
    for candidate in pattern_concepts {
        if seen_terms.insert(candidate.term.to_lowercase()) {
            found.push(candidate);
        }
    }

    found.truncate(15);
    found
}
/// Picks the sentence that best illustrates `term`, shortened to 150 bytes.
///
/// The first sentence that mentions the term (case-insensitively) and also
/// contains " is " or " are " is preferred. If no such sentence exists, the
/// first sentence that merely mentions the term is used. Returns an empty
/// string when the term is not mentioned at all.
fn find_best_context(sentences: &[String], term: &str) -> String {
    let needle = term.to_lowercase();
    // First plain mention, kept in case no definitional sentence turns up.
    let mut fallback: Option<&String> = None;

    for sentence in sentences {
        let folded = sentence.to_lowercase();
        if !folded.contains(&needle) {
            continue;
        }
        if folded.contains(" is ") || folded.contains(" are ") {
            return truncate(sentence, 150);
        }
        if fallback.is_none() {
            fallback = Some(sentence);
        }
    }

    match fallback {
        Some(sentence) => truncate(sentence, 150),
        None => String::new(),
    }
}
/// Derives a short definition for `term` from the transcript sentences.
///
/// Sentences are scanned in order and, within each sentence, two patterns are
/// tried (case-insensitively):
/// 1. `"<term> is <definition>"` — returns the definition with its first
///    letter capitalised;
/// 2. `"<term>, also known as <alias>"` — returns `"Also known as <alias>"`.
///
/// The captured text ends at the first `.` and is capped at 120 bytes (cut on
/// a character boundary), then trimmed. When no sentence matches, the fallback
/// `"Technical concept: <term>"` is returned.
///
/// Match offsets are computed on a lowercased copy of the sentence and applied
/// to the original. If lowercasing changes the byte length of the text before
/// the match, the slice can be misaligned; `str::get` keeps that from
/// panicking, and a failed slice simply moves on to the next pattern.
fn derive_concept_definition(sentences: &[String], term: &str) -> String {
    const MAX_DEFINITION_BYTES: usize = 120;

    /// Returns the trimmed clause that follows `needle` in `sentence`, where
    /// `lower` is the lowercased form of `sentence` used for searching.
    fn clause_after<'a>(sentence: &'a str, lower: &str, needle: &str) -> Option<&'a str> {
        // Skip exactly the matched needle, so the clause starts on the first
        // character after the marker and no hand-counted offset can drift.
        let start = lower.find(needle)? + needle.len();
        let rest = sentence.get(start..)?;
        let end = rest.find('.').unwrap_or(rest.len()).min(MAX_DEFINITION_BYTES);
        Some(safe_truncate_bytes(rest, end).trim())
    }

    let lower_term = term.to_lowercase();
    // Built once; they do not depend on the sentence being scanned.
    let is_needle = format!("{lower_term} is ");
    let alias_needle = format!("{lower_term}, also known as ");

    for sentence in sentences {
        let lower = sentence.to_lowercase();
        if let Some(clause) = clause_after(sentence, &lower, &is_needle) {
            return capitalize_first(clause);
        }
        if let Some(clause) = clause_after(sentence, &lower, &alias_needle) {
            return format!("Also known as {}", clause);
        }
    }
    format!("Technical concept: {term}")
}
/// Tries to read a concept out of a single definitional sentence.
///
/// The markers " is a ", " is an ", " is the " and " refers to " are tried in
/// that order against the lowercased sentence; the first marker that yields a
/// valid concept wins. Returns `None` when no marker produces one.
fn try_extract_definition_pattern(sentence: &str) -> Option<Concept> {
    const DEFINITION_MARKERS: [&str; 4] = [" is a ", " is an ", " is the ", " refers to "];

    let lowered = sentence.to_lowercase();
    for marker in &DEFINITION_MARKERS {
        if let Some(concept) = try_match_definition(sentence, &lowered, marker) {
            return Some(concept);
        }
    }
    None
}
/// Attempts to split `sentence` around `pat` into a term and its definition.
///
/// `lower` must be the lowercased form of `sentence`; it is only used to
/// locate `pat`. The term is made of up to three words preceding the marker
/// and is rejected when it is shorter than 3 bytes or starts with a lowercase
/// letter. The definition is the text after the marker up to the first `.`,
/// capped at 120 bytes, trimmed and capitalised; definitions shorter than
/// 5 bytes are rejected.
///
/// Accepted concepts get the sentence (shortened to 150 bytes) as context and
/// the `General` category.
fn try_match_definition(sentence: &str, lower: &str, pat: &str) -> Option<Concept> {
    let match_at = lower.find(pat)?;

    let candidate = extract_term_before(sentence, match_at);
    let starts_lowercase = candidate.chars().next().is_some_and(char::is_lowercase);
    if candidate.len() < 3 || starts_lowercase {
        return None;
    }

    // `get` returns None instead of panicking if the offset found in `lower`
    // does not land on a character boundary of `sentence`.
    let tail = sentence.get(match_at + pat.len()..)?;
    let cut = tail.find('.').unwrap_or(tail.len()).min(120);
    let definition = capitalize_first(safe_truncate_bytes(tail, cut).trim());
    if definition.len() < 5 {
        return None;
    }

    let term = candidate.trim().to_string();
    let context = truncate(sentence, 150);
    Some(Concept {
        term,
        definition,
        context,
        category: ConceptCategory::General,
    })
}
/// Returns the last (up to) three whitespace-separated words that precede
/// byte offset `pos` in `sentence`, joined by single spaces.
///
/// Yields an empty string when `pos` is out of range or not on a character
/// boundary, or when nothing but whitespace precedes it.
fn extract_term_before(sentence: &str, pos: usize) -> String {
    let Some(prefix) = sentence.get(..pos) else {
        return String::new();
    };
    let words: Vec<&str> = prefix.split_whitespace().collect();
    // Keep only the tail of the word list; fewer than three words keeps all.
    let first_kept = words.len().saturating_sub(3);
    words[first_kept..].join(" ")
}
/// Selects up to five canned code examples that fit the transcript.
///
/// The transcript text is lowercased once and offered first to the bash
/// example selector, then to the Python and Rust selectors (in that order).
/// `concepts` is passed along so examples can be labelled with a related
/// concept term.
fn extract_code_examples(transcript: &TranscriptInput, concepts: &[Concept]) -> Vec<CodeExample> {
    // (triggers, language tag, concept keywords, fallback label, code snippet)
    type LanguageSpec = (
        &'static [&'static str],
        &'static str,
        &'static [&'static str],
        &'static str,
        &'static str,
    );
    const LANGUAGE_SPECS: [LanguageSpec; 2] = [
        (
            &["python", "import", "pytorch"],
            "python",
            &["python", "pytorch", "model"],
            "Python",
            "import torch\nmodel = torch.load(\"model.pt\")\noutput = model(input_tensor)",
        ),
        (
            &["rust", "cargo", "trueno"],
            "rust",
            &["rust", "cargo", "trueno"],
            "Rust",
            "use trueno::Tensor;\nlet data = Tensor::from_slice(&[1.0, 2.0, 3.0]);\nlet result = data.matmul(&weights)?;",
        ),
    ];

    let lowered = transcript.text.to_lowercase();
    let mut collected: Vec<CodeExample> = Vec::new();

    extract_bash_examples(&lowered, concepts, &mut collected);
    for &(triggers, language, concept_keywords, fallback_name, code) in LANGUAGE_SPECS.iter() {
        extract_language_example(
            &lowered,
            concepts,
            &mut collected,
            triggers,
            language,
            concept_keywords,
            fallback_name,
            code,
        );
    }

    collected.truncate(5);
    collected
}
/// Appends a canned bash example for every command-line tool the transcript
/// mentions.
///
/// `text_lower` is expected to be the lowercased transcript text. A tool
/// counts as mentioned only when its name appears as a whole word, i.e. with
/// no alphanumeric character directly before or after it. Plain substring
/// matching would fire on unrelated words ("pip" in "pipeline", "git" in
/// "digital").
///
/// Each example is labelled with the first concept whose term contains the
/// tool name, falling back to the tool name itself.
fn extract_bash_examples(text_lower: &str, concepts: &[Concept], examples: &mut Vec<CodeExample>) {
    /// True when `needle` occurs in `haystack` as a standalone word.
    fn is_whole_word_mention(haystack: &str, needle: &str) -> bool {
        haystack.match_indices(needle).any(|(start, hit)| {
            // Offsets from `match_indices` are character boundaries, so
            // slicing on them cannot panic.
            let before = haystack[..start].chars().next_back();
            let after = haystack[start + hit.len()..].chars().next();
            !before.is_some_and(char::is_alphanumeric)
                && !after.is_some_and(char::is_alphanumeric)
        })
    }

    // (tool name as it appears in lowercased text, example command)
    const BASH_PATTERNS: &[(&str, &str)] = &[
        ("docker", "docker run -p 8080:8080 model-server"),
        ("pip", "pip install torch transformers"),
        ("cargo", "cargo build --release"),
        ("kubectl", "kubectl apply -f deployment.yaml"),
        ("curl", "curl -X POST http://localhost:8080/predict -d '{\"input\": \"text\"}'"),
        ("git", "git clone https://github.com/org/repo.git"),
    ];

    for &(keyword, code) in BASH_PATTERNS {
        if !is_whole_word_mention(text_lower, keyword) {
            continue;
        }
        let related =
            find_related_concept(concepts, &[keyword]).unwrap_or_else(|| keyword.to_string());
        examples.push(CodeExample {
            language: "bash".to_string(),
            code: code.to_string(),
            related_concept: related,
        });
    }
}
/// Appends one canned example for `language` when the transcript mentions any
/// of `triggers`.
///
/// * `text_lower` – the transcript text, expected to be lowercased already.
/// * `concepts` – concepts used to label the example.
/// * `examples` – output list the example is pushed onto.
/// * `triggers` – lowercase words whose presence selects this example.
/// * `language` – language tag stored on the example.
/// * `concept_keywords` – lowercase keywords used to find a related concept.
/// * `fallback_name` – label used when no concept term contains a keyword.
/// * `code` – the example snippet.
///
/// A trigger counts as mentioned only when it appears as a whole word, i.e.
/// with no alphanumeric character directly before or after it. Plain
/// substring matching would fire on unrelated words ("rust" in "trust" or
/// "robust", "import" in "important").
// Every argument is a distinct, required input, so the long list is kept as is.
#[allow(clippy::too_many_arguments)]
fn extract_language_example(
    text_lower: &str,
    concepts: &[Concept],
    examples: &mut Vec<CodeExample>,
    triggers: &[&str],
    language: &str,
    concept_keywords: &[&str],
    fallback_name: &str,
    code: &str,
) {
    /// True when `needle` occurs in `haystack` as a standalone word.
    fn is_whole_word_mention(haystack: &str, needle: &str) -> bool {
        haystack.match_indices(needle).any(|(start, hit)| {
            // Offsets from `match_indices` are character boundaries, so
            // slicing on them cannot panic.
            let before = haystack[..start].chars().next_back();
            let after = haystack[start + hit.len()..].chars().next();
            !before.is_some_and(char::is_alphanumeric)
                && !after.is_some_and(char::is_alphanumeric)
        })
    }

    let mentioned = triggers
        .iter()
        .any(|trigger| is_whole_word_mention(text_lower, trigger));
    if !mentioned {
        return;
    }

    let related = find_related_concept(concepts, concept_keywords)
        .unwrap_or_else(|| fallback_name.to_string());
    examples.push(CodeExample {
        language: language.to_string(),
        code: code.to_string(),
        related_concept: related,
    });
}
/// Returns the term of the first concept whose lowercased term contains any of
/// `keywords`, or `None` when no concept matches.
///
/// Keywords are compared as given, so they should already be lowercase.
fn find_related_concept(concepts: &[Concept], keywords: &[&str]) -> Option<String> {
    for concept in concepts {
        let folded = concept.term.to_lowercase();
        for keyword in keywords {
            if folded.contains(*keyword) {
                return Some(concept.term.clone());
            }
        }
    }
    None
}
/// Splits `text` into sentences at `.`, `!` and `?`.
///
/// Each terminator stays attached to the sentence it ends. Pieces are trimmed
/// and pieces that are empty after trimming are dropped; trailing text without
/// a terminator is kept as a final sentence.
fn split_sentences(text: &str) -> Vec<String> {
    text.split_inclusive(|ch: char| ch == '.' || ch == '!' || ch == '?')
        .map(str::trim)
        .filter(|piece| !piece.is_empty())
        .map(str::to_owned)
        .collect()
}
/// Shortens `s` to at most `max` bytes and appends `...` when anything was cut.
///
/// Strings that already fit are returned unchanged. The cut never splits a
/// multi-byte character, so the kept part may be shorter than `max` bytes.
fn truncate(s: &str, max: usize) -> String {
    if s.len() > max {
        let mut shortened = String::from(safe_truncate_bytes(s, max));
        shortened.push_str("...");
        shortened
    } else {
        s.to_owned()
    }
}
/// Returns the longest prefix of `s` that is at most `max_bytes` bytes long
/// and ends on a character boundary.
///
/// The whole string is returned when it already fits.
fn safe_truncate_bytes(s: &str, max_bytes: usize) -> &str {
    if s.len() <= max_bytes {
        return s;
    }
    // Walk back from the requested cut to the nearest boundary; offset 0 is
    // always a boundary, so the search cannot come up empty.
    let cut = (0..=max_bytes)
        .rev()
        .find(|&idx| s.is_char_boundary(idx))
        .unwrap_or(0);
    &s[..cut]
}
/// Returns `s` with its first character converted to uppercase.
///
/// The rest of the string is left untouched. Uppercasing may expand to more
/// than one character (for example `ß` becomes `SS`). An empty input yields an
/// empty string.
fn capitalize_first(s: &str) -> String {
    let Some(first) = s.chars().next() else {
        return String::new();
    };
    let mut out = String::with_capacity(s.len());
    out.extend(first.to_uppercase());
    out.push_str(&s[first.len_utf8()..]);
    out
}
// Unit tests for the key-concepts generator: end-to-end extraction, Markdown
// rendering, code-example selection, and the private text helpers.
#[cfg(test)]
mod tests {
use super::*;
use crate::oracle::coursera::types::TranscriptSegment;
// Builds an English transcript with the given text, no segments, and a fixed
// source path.
fn make_transcript(text: &str) -> TranscriptInput {
TranscriptInput {
text: text.to_string(),
language: "en".to_string(),
segments: vec![],
source_path: "test.txt".to_string(),
}
}
// A transcript that repeats several technical terms must yield at least one concept.
#[test]
fn test_generate_key_concepts() {
let t = make_transcript(
"MLOps is the practice of deploying ML models to production. \
MLOps combines DevOps and machine learning workflows. \
CI/CD pipelines automate the deployment process. \
CI/CD is essential for reliable software delivery. \
GPU acceleration speeds up model inference. \
GPU computing enables parallel processing.",
);
let reading = generate_key_concepts(&t);
assert!(!reading.concepts.is_empty());
}
// A plain, non-technical sentence must yield no concepts at all.
#[test]
fn test_generate_key_concepts_empty() {
let t = make_transcript("The cat sat on the mat.");
let reading = generate_key_concepts(&t);
assert!(reading.concepts.is_empty());
}
// Rendering one concept and one bash example must produce the title, the
// concept term, the code-examples heading and a bash fence.
#[test]
fn test_render_key_concepts_markdown() {
let reading = KeyConceptsReading {
concepts: vec![Concept {
term: "MLOps".to_string(),
definition: "Machine Learning Operations".to_string(),
context: "MLOps combines ML and DevOps.".to_string(),
category: ConceptCategory::Pattern,
}],
code_examples: vec![CodeExample {
language: "bash".to_string(),
code: "docker run app".to_string(),
related_concept: "Docker".to_string(),
}],
};
let md = render_key_concepts_markdown(&reading);
assert!(md.contains("# Key Concepts"));
assert!(md.contains("MLOps"));
assert!(md.contains("## Code Examples"));
assert!(md.contains("```bash"));
}
// An empty reading renders the "no key concepts" notice.
#[test]
fn test_render_key_concepts_empty() {
let reading = KeyConceptsReading { concepts: vec![], code_examples: vec![] };
let md = render_key_concepts_markdown(&reading);
assert!(md.contains("No key concepts extracted"));
}
// Mentioning "docker" must produce a bash example as the first entry.
#[test]
fn test_extract_code_examples_bash() {
let t = make_transcript(
"We use docker to deploy our models. Docker containers are lightweight.",
);
let concepts = vec![Concept {
term: "Docker".to_string(),
definition: "Container runtime".to_string(),
context: "".to_string(),
category: ConceptCategory::Tool,
}];
let examples = extract_code_examples(&t, &concepts);
assert!(!examples.is_empty());
assert_eq!(examples[0].language, "bash");
}
// Mentioning Python/PyTorch must produce a python example, even with no concepts.
#[test]
fn test_extract_code_examples_python() {
let t = make_transcript("Python and PyTorch are used for model training. Python scripts handle data processing.");
let concepts = vec![];
let examples = extract_code_examples(&t, &concepts);
let python_example = examples.iter().find(|e| e.language == "python");
assert!(python_example.is_some());
}
// Mentioning Rust/cargo must produce a rust example, even with no concepts.
#[test]
fn test_extract_code_examples_rust() {
let t = make_transcript(
"Rust and cargo are used for high-performance computing. Rust provides memory safety.",
);
let concepts = vec![];
let examples = extract_code_examples(&t, &concepts);
let rust_example = examples.iter().find(|e| e.language == "rust");
assert!(rust_example.is_some());
}
// An "X is a ..." sentence must yield a concept whose term includes the
// capitalised noun before the marker.
#[test]
fn test_try_extract_definition_pattern() {
let result = try_extract_definition_pattern(
"Batch Normalization is a technique that normalizes layer inputs.",
);
assert!(result.is_some());
let concept = result.expect("operation failed");
assert!(concept.term.contains("Normalization"));
}
// A term repeated across sentences must appear at most once (case-insensitively).
#[test]
fn test_duplicate_terms_deduplicated() {
let t = make_transcript(
"MLOps is the practice of deploying ML models. MLOps automates deployment. \
MLOps combines DevOps and ML. MLOps pipelines handle continuous delivery. \
MLOps teams build reliable systems.",
);
let reading = generate_key_concepts(&t);
let mlops_count =
reading.concepts.iter().filter(|c| c.term.to_lowercase() == "mlops").count();
assert!(mlops_count <= 1, "MLOps should appear at most once");
}
// The "<term> is ..." pattern must surface text from the rest of the sentence.
#[test]
fn test_derive_concept_definition_is_pattern() {
let sentences =
vec!["Kubernetes is an open-source container orchestration platform.".to_string()];
let def = super::derive_concept_definition(&sentences, "Kubernetes");
assert!(def.contains("open-source") || def.contains("container"), "Got: {def}");
}
// The "<term>, also known as ..." pattern must produce an "Also known as" result.
// NOTE: only the prefix is checked; the alias text after it is not asserted.
#[test]
fn test_derive_concept_definition_also_known_as() {
let sentences = vec!["K8s, also known as Kubernetes container orchestration.".to_string()];
let def = super::derive_concept_definition(&sentences, "K8s");
assert!(def.starts_with("Also known as"), "Got: {def}");
}
// With no matching sentence the generic "Technical concept: <term>" fallback is used.
#[test]
fn test_derive_concept_definition_fallback() {
let sentences = vec!["Random text about something.".to_string()];
let def = super::derive_concept_definition(&sentences, "QUIC");
assert!(def.contains("Technical concept: QUIC"), "Got: {def}");
}
// A term that no sentence mentions yields an empty context.
#[test]
fn test_find_best_context_no_match() {
let sentences = vec!["The cat sat on the mat.".to_string()];
let ctx = super::find_best_context(&sentences, "kubernetes");
assert!(ctx.is_empty());
}
// "It" is only two bytes long, below the three-byte minimum for a term.
#[test]
fn test_try_match_definition_short_term_rejected() {
let result = super::try_match_definition("It is a test.", "it is a test.", " is a ");
assert!(result.is_none());
}
// "ok" is shorter than the five-byte minimum for a definition.
#[test]
fn test_try_match_definition_short_definition_rejected() {
let result = try_extract_definition_pattern("BigThing is a ok.");
assert!(result.is_none());
}
// A 200-byte string cut at 50 must end with the ellipsis and stay within the
// asserted 54-byte bound.
#[test]
fn test_truncate_long_string() {
let long = "a".repeat(200);
let result = super::truncate(&long, 50);
assert!(result.ends_with("..."));
assert!(result.len() <= 54); }
// Truncating text that contains multi-byte characters must return a non-empty
// prefix that ends on a character boundary.
#[test]
fn test_safe_truncate_bytes_multibyte() {
let s = "héllo wörld";
let truncated = super::safe_truncate_bytes(s, 3);
assert!(!truncated.is_empty());
assert!(s.is_char_boundary(truncated.len()));
}
// Capitalising an empty string yields an empty string.
#[test]
fn test_capitalize_first_empty() {
assert_eq!(super::capitalize_first(""), "");
}
// Text after the last terminator is kept as its own sentence.
#[test]
fn test_split_sentences_trailing_text() {
let sentences = super::split_sentences("Hello world. This has no period");
assert_eq!(sentences.len(), 2);
assert_eq!(sentences[1], "This has no period");
}
// The " refers to " marker must also produce a concept with a usable definition.
#[test]
fn test_definition_pattern_refers_to() {
let result =
try_extract_definition_pattern("MLOps refers to the practice of operationalizing ML.");
assert!(result.is_some());
let concept = result.expect("operation failed");
assert!(
concept.definition.contains("practice")
|| concept.definition.contains("operationalizing")
);
}
// A transcript that carries timed segments must still yield concepts.
#[test]
fn test_concepts_with_segments() {
let t = TranscriptInput {
text: "API endpoints serve ML predictions. The API handles inference. \
GPU acceleration is critical. GPU kernels run fast."
.to_string(),
language: "en".to_string(),
segments: vec![TranscriptSegment {
start: 0.0,
end: 10.0,
text: "API endpoints serve ML predictions.".to_string(),
}],
source_path: "test.json".to_string(),
};
let reading = generate_key_concepts(&t);
assert!(!reading.concepts.is_empty());
}
}