Skip to main content

tldr_cli/commands/patterns/
temporal.rs

1//! Temporal Command - Temporal Constraint Mining
2//!
3//! Mines temporal constraints (method call sequences) from a codebase.
4//!
5//! # Algorithm
6//!
7//! 1. Extract method call sequences from each function
8//! 2. Build frequency table of (before, after) pairs (bigrams)
9//! 3. Calculate confidence: count(A->B) / count(A)
10//! 4. Filter by min_support and min_confidence
11//! 5. Optionally mine trigrams (3-method sequences)
12//!
13//! # TIGER Mitigations
14//!
15//! - **T05**: MAX_TRIGRAMS=10000 with BinaryHeap top-K selection
16//! - **E03**: --timeout flag (default 60s)
17//!
18//! # Example
19//!
20//! ```bash
21//! # Mine constraints from a directory
22//! tldr temporal src/ --min-support 2 --min-confidence 0.5
23//!
24//! # Filter for specific method
25//! tldr temporal src/ --query open
26//!
27//! # Include trigram patterns
28//! tldr temporal src/ --include-trigrams
29//! ```
30
31use std::cmp::Reverse;
32use std::collections::{BinaryHeap, HashMap};
33use std::path::{Path, PathBuf};
34use std::time::{Duration, Instant};
35
36use clap::Args;
37use tree_sitter::{Node, Parser};
38
39use tldr_core::callgraph::{
40    build_project_call_graph_v2, extract_calls_for_language, BuildConfig, CallSite,
41};
42use tldr_core::types::Language;
43
44use crate::output::OutputFormat as GlobalOutputFormat;
45
46use super::error::{PatternsError, PatternsResult};
47use super::types::{
48    OutputFormat, TemporalConstraint, TemporalExample, TemporalMetadata, TemporalReport, Trigram,
49};
50use super::validation::{
51    check_directory_file_count, read_file_safe, validate_directory_path, validate_file_path,
52    validate_file_path_in_project, MAX_TRIGRAMS,
53};
54
55// =============================================================================
56// CLI Arguments
57// =============================================================================
58
59/// Mine temporal constraints (method call sequences) from a codebase.
60#[derive(Debug, Args)]
61pub struct TemporalArgs {
62    /// Directory or file to analyze
63    pub path: PathBuf,
64
65    /// Minimum occurrences for a pattern
66    #[arg(long, default_value = "2")]
67    pub min_support: u32,
68
69    /// Minimum confidence threshold (0.0-1.0)
70    #[arg(long, default_value = "0.5")]
71    pub min_confidence: f64,
72
73    /// Filter for specific method
74    #[arg(long)]
75    pub query: Option<String>,
76
77    /// Source language hint (legacy; prefer the global `--lang/-l` flag).
78    /// Accepts any of the 18 TLDR languages or `auto` for autodetect.
79    #[arg(long = "source-lang", default_value = "python")]
80    pub source_lang: String,
81
82    /// Maximum files to analyze
83    #[arg(long, default_value = "1000")]
84    pub max_files: u32,
85
86    /// Mine 3-method sequences
87    #[arg(long)]
88    pub include_trigrams: bool,
89
90    /// Number of examples per constraint
91    #[arg(long, default_value = "3")]
92    pub include_examples: u32,
93
94    /// Output format (json or text). Prefer global --format/-f flag.
95    #[arg(
96        long = "output",
97        short = 'o',
98        hide = true,
99        default_value = "json",
100        value_enum
101    )]
102    pub output_format: OutputFormat,
103
104    /// Timeout in seconds (E03 mitigation)
105    #[arg(long, default_value = "60")]
106    pub timeout: u64,
107
108    /// Project root for path validation (optional)
109    #[arg(long)]
110    pub project_root: Option<PathBuf>,
111
112    /// Language filter (auto-detected if omitted)
113    #[arg(long, short = 'l')]
114    pub lang: Option<Language>,
115}
116
117impl TemporalArgs {
118    /// Run the temporal analysis command
119    pub fn run(&self, global_format: GlobalOutputFormat) -> anyhow::Result<()> {
120        run(self.clone(), global_format)
121    }
122}
123
124impl Clone for TemporalArgs {
125    fn clone(&self) -> Self {
126        Self {
127            path: self.path.clone(),
128            min_support: self.min_support,
129            min_confidence: self.min_confidence,
130            query: self.query.clone(),
131            source_lang: self.source_lang.clone(),
132            max_files: self.max_files,
133            include_trigrams: self.include_trigrams,
134            include_examples: self.include_examples,
135            output_format: self.output_format,
136            timeout: self.timeout,
137            project_root: self.project_root.clone(),
138            lang: self.lang,
139        }
140    }
141}
142
143// =============================================================================
144// Sequence Extraction
145// =============================================================================
146
147/// Extractor for method call sequences from source code.
148#[derive(Debug, Default)]
149pub struct SequenceExtractor {
150    /// Current function being analyzed
151    current_function: String,
152    /// Extracted sequences: object_key -> list of method names
153    sequences: HashMap<String, Vec<String>>,
154    /// Variable assignments: variable -> assigned from (for tracking objects)
155    var_assignments: HashMap<String, String>,
156    /// Current line number
157    current_line: u32,
158}
159
160impl SequenceExtractor {
161    /// Create a new sequence extractor for a file
162    pub fn new() -> Self {
163        Self::default()
164    }
165
166    /// Extract sequences from a function node
167    pub fn extract_function(&mut self, func_node: Node, source: &[u8]) {
168        // Get function name
169        let func_name = self.get_function_name(func_node, source);
170        if func_name.is_empty() {
171            return;
172        }
173        self.current_function = func_name;
174        self.var_assignments.clear();
175
176        // Walk the function body and extract call sequences
177        self.extract_calls_recursive(func_node, source, 0);
178    }
179
180    /// Recursively extract method calls from AST nodes
181    fn extract_calls_recursive(&mut self, node: Node, source: &[u8], depth: usize) {
182        // Prevent stack overflow
183        if depth > 100 {
184            return;
185        }
186
187        self.current_line = node.start_position().row as u32 + 1;
188
189        match node.kind() {
190            // Track assignments: x = open(...) or x = something.method()
191            "assignment" => {
192                self.handle_assignment(node, source);
193            }
194
195            // Track method calls: x.read(), x.close(), etc.
196            "call" => {
197                self.handle_call(node, source);
198            }
199
200            // Track with statements: with open(...) as f
201            "with_statement" => {
202                self.handle_with_statement(node, source);
203            }
204
205            _ => {}
206        }
207
208        // Recurse into children
209        let mut cursor = node.walk();
210        for child in node.children(&mut cursor) {
211            self.extract_calls_recursive(child, source, depth + 1);
212        }
213    }
214
215    /// Handle an assignment statement
216    fn handle_assignment(&mut self, node: Node, source: &[u8]) {
217        // Get the left side (variable name)
218        let var_name = if let Some(left) = node.child_by_field_name("left") {
219            self.node_text(left, source).to_string()
220        } else {
221            // Try to find pattern targets (for simple assignments)
222            let mut var = String::new();
223            for child in node.children(&mut node.walk()) {
224                if child.kind() == "identifier" {
225                    var = self.node_text(child, source).to_string();
226                    break;
227                }
228            }
229            var
230        };
231
232        if var_name.is_empty() {
233            return;
234        }
235
236        // Get the right side (value)
237        if let Some(right) = node.child_by_field_name("right") {
238            // Check if it's a call expression
239            if right.kind() == "call" {
240                let call_name = self.extract_call_name(right, source);
241                if !call_name.is_empty() {
242                    // Track the assignment: var_name was assigned from call_name
243                    self.var_assignments
244                        .insert(var_name.clone(), call_name.clone());
245
246                    // Add to sequence: func:var -> [constructor_call]
247                    let key = format!("{}:{}", self.current_function, var_name);
248                    self.sequences.entry(key).or_default().push(call_name);
249                }
250            }
251        }
252    }
253
254    /// Handle a call expression
255    fn handle_call(&mut self, node: Node, source: &[u8]) {
256        // Extract the call structure: object.method() or function()
257        if let Some(func) = node.child_by_field_name("function") {
258            if func.kind() == "attribute" {
259                // Method call: obj.method()
260                if let Some(obj) = func.child_by_field_name("object") {
261                    let obj_name = self.node_text(obj, source).to_string();
262                    if let Some(method) = func.child_by_field_name("attribute") {
263                        let method_name = self.node_text(method, source).to_string();
264
265                        // Add to sequence for this object
266                        let key = format!("{}:{}", self.current_function, obj_name);
267                        self.sequences.entry(key).or_default().push(method_name);
268                    }
269                }
270            }
271        }
272    }
273
274    /// Handle a with statement
275    fn handle_with_statement(&mut self, node: Node, source: &[u8]) {
276        // Extract: with open(path) as f
277        for child in node.children(&mut node.walk()) {
278            if child.kind() == "with_clause" {
279                for item in child.children(&mut child.walk()) {
280                    if item.kind() == "with_item" {
281                        // Get the expression (open(...))
282                        let mut call_name = String::new();
283                        let mut var_name = String::new();
284
285                        for part in item.children(&mut item.walk()) {
286                            if part.kind() == "call" {
287                                call_name = self.extract_call_name(part, source);
288                            } else if part.kind() == "as_pattern" || part.kind() == "identifier" {
289                                // Get the alias
290                                if part.kind() == "identifier" {
291                                    var_name = self.node_text(part, source).to_string();
292                                } else {
293                                    for as_child in part.children(&mut part.walk()) {
294                                        if as_child.kind() == "identifier" {
295                                            var_name = self.node_text(as_child, source).to_string();
296                                            break;
297                                        }
298                                    }
299                                }
300                            }
301                        }
302
303                        if !call_name.is_empty() && !var_name.is_empty() {
304                            let key = format!("{}:{}", self.current_function, var_name);
305                            self.sequences
306                                .entry(key.clone())
307                                .or_default()
308                                .push(call_name);
309                            // with statement implies automatic close
310                            self.sequences
311                                .entry(key)
312                                .or_default()
313                                .push("__exit__".to_string());
314                        }
315                    }
316                }
317            }
318        }
319    }
320
321    /// Extract the call name from a call node
322    fn extract_call_name(&self, node: Node, source: &[u8]) -> String {
323        if let Some(func) = node.child_by_field_name("function") {
324            return self.extract_name_from_expr(func, source);
325        }
326
327        // Fallback: iterate children
328        for child in node.children(&mut node.walk()) {
329            match child.kind() {
330                "identifier" => return self.node_text(child, source).to_string(),
331                "attribute" => return self.extract_name_from_expr(child, source),
332                _ => continue,
333            }
334        }
335        String::new()
336    }
337
338    /// Extract a dotted name from an expression
339    fn extract_name_from_expr(&self, node: Node, source: &[u8]) -> String {
340        match node.kind() {
341            "identifier" => self.node_text(node, source).to_string(),
342            "attribute" => {
343                // Get just the last part (method name)
344                if let Some(attr) = node.child_by_field_name("attribute") {
345                    self.node_text(attr, source).to_string()
346                } else {
347                    String::new()
348                }
349            }
350            _ => self.node_text(node, source).to_string(),
351        }
352    }
353
354    /// Get function name from a function definition
355    fn get_function_name(&self, node: Node, source: &[u8]) -> String {
356        for child in node.children(&mut node.walk()) {
357            if child.kind() == "identifier" {
358                return self.node_text(child, source).to_string();
359            }
360        }
361        String::new()
362    }
363
364    /// Get text for a node
365    fn node_text<'a>(&self, node: Node, source: &'a [u8]) -> &'a str {
366        node.utf8_text(source).unwrap_or("")
367    }
368
369    /// Get extracted sequences
370    pub fn get_sequences(&self) -> &HashMap<String, Vec<String>> {
371        &self.sequences
372    }
373}
374
375/// Extract method call sequences from source code
376pub fn extract_sequences(source: &str) -> HashMap<String, Vec<String>> {
377    let mut extractor = SequenceExtractor::new();
378
379    // Parse with tree-sitter
380    let mut parser = match get_python_parser() {
381        Ok(p) => p,
382        Err(_) => return HashMap::new(),
383    };
384
385    let tree = match parser.parse(source, None) {
386        Some(t) => t,
387        None => return HashMap::new(),
388    };
389
390    let root = tree.root_node();
391    let source_bytes = source.as_bytes();
392
393    // Find all function definitions and extract sequences
394    extract_functions_recursive(root, source_bytes, &mut extractor);
395
396    extractor.sequences
397}
398
399/// Recursively find function definitions and extract sequences
400fn extract_functions_recursive(node: Node, source: &[u8], extractor: &mut SequenceExtractor) {
401    match node.kind() {
402        "function_definition" | "async_function_definition" => {
403            extractor.extract_function(node, source);
404        }
405        _ => {}
406    }
407
408    // Recurse into children
409    let mut cursor = node.walk();
410    for child in node.children(&mut cursor) {
411        extract_functions_recursive(child, source, extractor);
412    }
413}
414
415// =============================================================================
416// Generalized per-language sequence extraction (VAL-016)
417// =============================================================================
418//
419// `extract_sequences` (above) is the historical Python-AST walker that tracks
420// receiver-aware variable lifetimes (e.g. `f = open(...); f.read(); f.close()`
421// emits `[open, read, close]` keyed by `func:f`). It is kept for backward
422// compatibility on Python and for the bespoke `with`/`__exit__` modelling
423// the call-graph IR doesn't currently express.
424//
425// For the other 17 languages we reuse each language's call-graph handler,
426// which already extracts per-method `Vec<CallSite>` with line numbers (see
427// `tldr_core::callgraph::cross_file_types::FileIR::calls`). Sorting that
428// list by line yields the temporal sequence per scope; each sequence is
429// keyed by the qualifying caller name so cross-method calls don't bleed
430// into one another.
431
432/// Convert per-caller CallSite lists into temporal sequences keyed by
433/// `<file>::<caller>`. Each sequence is sorted by line number so the
434/// resulting order matches source-order method dispatch.
435///
436/// Receiver-prefixing rule: when a CallSite has a `receiver`, the
437/// sequence entry is the bare `target` (the method name) — receiver
438/// information is intentionally dropped because temporal mining is
439/// interested in *which method* runs, not the variable it ran on.
440/// This keeps the bigram alphabet finite across runs.
441fn sequences_from_callsite_map(
442    file_key: &str,
443    calls_by_func: &HashMap<String, Vec<CallSite>>,
444) -> HashMap<String, Vec<String>> {
445    let mut out: HashMap<String, Vec<String>> = HashMap::new();
446    for (caller, sites) in calls_by_func {
447        if sites.is_empty() {
448            continue;
449        }
450        // Sort a copy by line (ascending). Sites without a line sink to
451        // the end and preserve their relative order for determinism.
452        let mut ordered = sites.clone();
453        ordered.sort_by_key(|s| s.line.unwrap_or(u32::MAX));
454
455        let names: Vec<String> = ordered
456            .into_iter()
457            .map(|s| s.target)
458            .filter(|t| !t.is_empty())
459            .collect();
460
461        if names.is_empty() {
462            continue;
463        }
464        let key = format!("{}::{}", file_key, caller);
465        out.insert(key, names);
466    }
467    out
468}
469
470/// Result of extracting sequences for a single file: the per-caller
471/// sequences and the example-line of the first call site (used to seed
472/// `TemporalExample.line`).
473struct FileSequences {
474    sequences: HashMap<String, Vec<String>>,
475    /// First line per (caller, target_pair) — used to give each bigram
476    /// a real line number rather than the legacy hard-coded `1`.
477    first_line: HashMap<(String, String, String), u32>,
478}
479
480/// Extract sequences for a single source file using the language-aware
481/// call-graph handler.
482///
483/// For Python, this *combines* two sources:
484///   1. The legacy receiver-aware AST walker (`extract_sequences`),
485///      which produces `<func>:<var>` keyed sequences for patterns
486///      like `f = open(...); f.read(); f.close()` plus the bespoke
487///      `with` / `__exit__` modelling. This is the historical
488///      behaviour and is required for the `open -> read -> close`
489///      resource-lifecycle bigrams the temporal command was originally
490///      designed to mine.
491///   2. The call-graph handler (`extract_calls_for_language`), which
492///      captures bare calls like `helper(); b_util()` keyed by
493///      `<file>::<caller>`. The legacy walker does NOT capture these.
494///
495/// The two key namespaces are disjoint (`:` vs `::`) so they coexist.
496/// For the other 17 languages only the call-graph path runs.
497fn extract_sequences_for_file(
498    path: &Path,
499    source: &str,
500    language: Language,
501) -> PatternsResult<FileSequences> {
502    let file_key = path.to_string_lossy().to_string();
503
504    let mut sequences: HashMap<String, Vec<String>> = HashMap::new();
505    let mut first_line: HashMap<(String, String, String), u32> = HashMap::new();
506
507    // Python regression path — preserve legacy receiver-aware sequences.
508    if language == Language::Python {
509        let legacy = extract_sequences(source);
510        for (k, v) in legacy {
511            // The legacy walker uses `<func>:<var>` keys (single colon).
512            sequences.entry(k).or_default().extend(v);
513        }
514    }
515
516    // Call-graph handler path — runs for all 18 languages, picking up
517    // bare calls plus method/attribute calls in source order.
518    let lang_str = language.as_str();
519    let calls_by_func = match extract_calls_for_language(lang_str, path, source) {
520        Ok(map) => map,
521        Err(_) => {
522            // OCaml is the only language that hits this fallback (the
523            // single-file extractor doesn't currently expose its
524            // tree-sitter language). The directory analyzer routes
525            // OCaml through `build_project_call_graph_v2` instead so
526            // returning whatever sequences we have so far is correct.
527            return Ok(FileSequences {
528                sequences,
529                first_line,
530            });
531        }
532    };
533
534    let scoped = sequences_from_callsite_map(&file_key, &calls_by_func);
535    for (k, v) in scoped {
536        sequences.entry(k).or_default().extend(v);
537    }
538
539    // Build the (caller, before, after) -> first_line lookup so bigram
540    // examples carry an accurate line for non-Python sequences.
541    for (caller, sites) in &calls_by_func {
542        let mut ordered = sites.clone();
543        ordered.sort_by_key(|s| s.line.unwrap_or(u32::MAX));
544        for pair in ordered.windows(2) {
545            let before = pair[0].target.clone();
546            let after = pair[1].target.clone();
547            if before.is_empty() || after.is_empty() || before == after {
548                continue;
549            }
550            let line = pair[1].line.unwrap_or(1);
551            first_line
552                .entry((caller.clone(), before, after))
553                .or_insert(line);
554        }
555    }
556
557    Ok(FileSequences {
558        sequences,
559        first_line,
560    })
561}
562
563/// Detect the language for a directory. Falls back to `args.lang` if
564/// auto-detection returns nothing.
565fn resolve_directory_language(path: &Path, args: &TemporalArgs) -> Option<Language> {
566    if let Some(lang) = args.lang {
567        return Some(lang);
568    }
569    Language::from_directory(path)
570}
571
572/// Build a `(caller, before, after) -> first-line` lookup from the
573/// call-graph IR's per-caller CallSite lists. This is the project-wide
574/// counterpart to `extract_sequences_for_file::first_line` so OCaml
575/// bigram examples carry an accurate line.
576fn per_caller_first_line(
577    calls_by_func: &HashMap<String, Vec<CallSite>>,
578) -> HashMap<(String, String, String), u32> {
579    let mut first_line: HashMap<(String, String, String), u32> = HashMap::new();
580    for (caller, sites) in calls_by_func {
581        let mut ordered = sites.clone();
582        ordered.sort_by_key(|s| s.line.unwrap_or(u32::MAX));
583        for pair in ordered.windows(2) {
584            let before = pair[0].target.clone();
585            let after = pair[1].target.clone();
586            if before.is_empty() || after.is_empty() || before == after {
587                continue;
588            }
589            let line = pair[1].line.unwrap_or(1);
590            first_line
591                .entry((caller.clone(), before, after))
592                .or_insert(line);
593        }
594    }
595    first_line
596}
597
598/// Aggregate one file's sequences into the directory-wide accumulator.
599/// Counts bigrams, tracks before/after totals for confidence, and
600/// records up to `args.include_examples` example sites per pair.
601#[allow(clippy::too_many_arguments)]
602fn aggregate_file_sequences(
603    file_sequences: &HashMap<String, Vec<String>>,
604    file_path_str: &str,
605    first_line: &HashMap<(String, String, String), u32>,
606    all_sequences: &mut HashMap<String, Vec<String>>,
607    bigram_counts: &mut HashMap<(String, String), u32>,
608    before_counts: &mut HashMap<String, u32>,
609    all_examples: &mut HashMap<(String, String), Vec<TemporalExample>>,
610    args: &TemporalArgs,
611) {
612    for (key, calls) in file_sequences {
613        all_sequences
614            .entry(key.clone())
615            .or_default()
616            .extend(calls.clone());
617
618        // Recover the caller name from the sequence key for line
619        // lookup. Keys produced by `sequences_from_callsite_map` are
620        // `<file>::<caller>`; Python's legacy keys are `<func>:<var>`.
621        // For the Python case the `first_line` map is empty so we fall
622        // back to line=1 (preserving prior CLI output exactly).
623        let caller_for_lookup = key
624            .rsplit_once("::")
625            .map(|(_, c)| c.to_string())
626            .unwrap_or_default();
627
628        for i in 0..calls.len().saturating_sub(1) {
629            let before = &calls[i];
630            let after = &calls[i + 1];
631
632            if before == after {
633                continue;
634            }
635
636            let pair = (before.clone(), after.clone());
637            *bigram_counts.entry(pair.clone()).or_default() += 1;
638            *before_counts.entry(before.clone()).or_default() += 1;
639
640            // Track examples
641            let examples = all_examples.entry(pair).or_default();
642            if examples.len() < args.include_examples as usize {
643                let line = first_line
644                    .get(&(caller_for_lookup.clone(), before.clone(), after.clone()))
645                    .copied()
646                    .unwrap_or(1);
647                examples.push(TemporalExample {
648                    file: file_path_str.to_string(),
649                    line,
650                });
651            }
652        }
653    }
654}
655
656// =============================================================================
657// Bigram Mining
658// =============================================================================
659
660/// Counter for bigrams with example tracking
661#[derive(Debug, Default)]
662pub struct BigramCounter {
663    /// Bigram counts: (before, after) -> count
664    pub counts: HashMap<(String, String), u32>,
665    /// Before counts: method -> count of times it's followed by something
666    pub before_counts: HashMap<String, u32>,
667    /// Example locations: (before, after) -> list of (file, line)
668    pub examples: HashMap<(String, String), Vec<TemporalExample>>,
669}
670
671impl BigramCounter {
672    /// Create a new bigram counter
673    pub fn new() -> Self {
674        Self::default()
675    }
676
677    /// Add sequences from extraction
678    pub fn add_sequences(&mut self, sequences: &HashMap<String, Vec<String>>, file: &str) {
679        for calls in sequences.values() {
680            // Parse function name from key (func:var)
681            let line = 1u32; // Would need more tracking for accurate line numbers
682
683            for i in 0..calls.len().saturating_sub(1) {
684                let before = &calls[i];
685                let after = &calls[i + 1];
686
687                // Skip self-loops
688                if before == after {
689                    continue;
690                }
691
692                let pair = (before.clone(), after.clone());
693
694                // Increment bigram count
695                *self.counts.entry(pair.clone()).or_default() += 1;
696
697                // Increment before count
698                *self.before_counts.entry(before.clone()).or_default() += 1;
699
700                // Add example
701                self.examples
702                    .entry(pair)
703                    .or_default()
704                    .push(TemporalExample {
705                        file: file.to_string(),
706                        line,
707                    });
708            }
709        }
710    }
711}
712
713/// Mine bigram constraints from sequences
714pub fn mine_bigrams(
715    sequences: &HashMap<String, Vec<String>>,
716    file: &str,
717    args: &TemporalArgs,
718) -> (BigramCounter, Vec<TemporalConstraint>) {
719    let mut counter = BigramCounter::new();
720    counter.add_sequences(sequences, file);
721
722    let mut constraints = Vec::new();
723
724    for ((before, after), count) in &counter.counts {
725        // Filter by min_support
726        if *count < args.min_support {
727            continue;
728        }
729
730        // Calculate confidence
731        let before_total = *counter.before_counts.get(before).unwrap_or(&1);
732        let confidence = (*count as f64) / (before_total as f64);
733
734        // Filter by min_confidence
735        if confidence < args.min_confidence {
736            continue;
737        }
738
739        // Get examples (limited)
740        let examples = counter
741            .examples
742            .get(&(before.clone(), after.clone()))
743            .map(|ex| {
744                ex.iter()
745                    .take(args.include_examples as usize)
746                    .cloned()
747                    .collect()
748            })
749            .unwrap_or_default();
750
751        constraints.push(TemporalConstraint {
752            before: before.clone(),
753            after: after.clone(),
754            support: *count,
755            confidence,
756            examples,
757        });
758    }
759
760    // Sort by confidence (descending), then support (descending)
761    constraints.sort_by(|a, b| {
762        b.confidence
763            .partial_cmp(&a.confidence)
764            .unwrap_or(std::cmp::Ordering::Equal)
765            .then_with(|| b.support.cmp(&a.support))
766    });
767
768    (counter, constraints)
769}
770
771// =============================================================================
772// Trigram Mining (TIGER-05: MAX_TRIGRAMS limit)
773// =============================================================================
774
775/// Mine trigram patterns with MAX_TRIGRAMS limit (TIGER-05)
776pub fn mine_trigrams(
777    sequences: &HashMap<String, Vec<String>>,
778    args: &TemporalArgs,
779) -> Vec<Trigram> {
780    // Count trigrams
781    let mut trigram_counts: HashMap<(String, String, String), u32> = HashMap::new();
782    let mut bigram_follows: HashMap<(String, String), u32> = HashMap::new();
783
784    for calls in sequences.values() {
785        for i in 0..calls.len().saturating_sub(2) {
786            let a = &calls[i];
787            let b = &calls[i + 1];
788            let c = &calls[i + 2];
789
790            // Skip if any self-loops
791            if a == b || b == c {
792                continue;
793            }
794
795            *trigram_counts
796                .entry((a.clone(), b.clone(), c.clone()))
797                .or_default() += 1;
798
799            // Count bigram follows
800            if a != b {
801                *bigram_follows.entry((a.clone(), b.clone())).or_default() += 1;
802            }
803        }
804    }
805
806    // TIGER-05: Use BinaryHeap for top-K selection to limit memory
807    // We use a min-heap of size MAX_TRIGRAMS, keeping the largest support values
808    let mut heap: BinaryHeap<Reverse<(u32, String, String, String)>> = BinaryHeap::new();
809
810    for ((a, b, c), count) in &trigram_counts {
811        if *count < args.min_support {
812            continue;
813        }
814
815        // Calculate confidence
816        let bigram_total = *bigram_follows.get(&(a.clone(), b.clone())).unwrap_or(&1);
817        let confidence = (*count as f64) / (bigram_total as f64);
818
819        if confidence < args.min_confidence {
820            continue;
821        }
822
823        // Add to heap with support as priority
824        if heap.len() < MAX_TRIGRAMS {
825            heap.push(Reverse((*count, a.clone(), b.clone(), c.clone())));
826        } else if let Some(&Reverse((min_support, _, _, _))) = heap.peek() {
827            if *count > min_support {
828                heap.pop();
829                heap.push(Reverse((*count, a.clone(), b.clone(), c.clone())));
830            }
831        }
832    }
833
834    // Convert heap to sorted vector
835    let mut trigrams: Vec<Trigram> = heap
836        .into_iter()
837        .map(|Reverse((support, a, b, c))| {
838            let bigram_total = *bigram_follows.get(&(a.clone(), b.clone())).unwrap_or(&1);
839            let confidence = (support as f64) / (bigram_total as f64);
840
841            Trigram {
842                sequence: [a, b, c],
843                support,
844                confidence,
845            }
846        })
847        .collect();
848
849    // Sort by confidence (descending), then support (descending)
850    trigrams.sort_by(|a, b| {
851        b.confidence
852            .partial_cmp(&a.confidence)
853            .unwrap_or(std::cmp::Ordering::Equal)
854            .then_with(|| b.support.cmp(&a.support))
855    });
856
857    trigrams
858}
859
860// =============================================================================
861// Query Filtering
862// =============================================================================
863
864/// Filter constraints by query string
865pub fn filter_by_query(
866    constraints: Vec<TemporalConstraint>,
867    query: &str,
868) -> Vec<TemporalConstraint> {
869    constraints
870        .into_iter()
871        .filter(|c| c.before.contains(query) || c.after.contains(query))
872        .collect()
873}
874
875/// Filter trigrams by query string
876pub fn filter_trigrams_by_query(trigrams: Vec<Trigram>, query: &str) -> Vec<Trigram> {
877    trigrams
878        .into_iter()
879        .filter(|t| t.sequence.iter().any(|s| s.contains(query)))
880        .collect()
881}
882
883// =============================================================================
884// Tree-sitter Parser
885// =============================================================================
886
887/// Initialize tree-sitter parser for Python
888fn get_python_parser() -> PatternsResult<Parser> {
889    let mut parser = Parser::new();
890    let language = tree_sitter_python::LANGUAGE;
891    parser.set_language(&language.into()).map_err(|e| {
892        PatternsError::parse_error(PathBuf::new(), format!("Failed to set language: {}", e))
893    })?;
894    Ok(parser)
895}
896
897// =============================================================================
898// File Analysis
899// =============================================================================
900
901type TemporalFileAnalysis = (HashMap<String, Vec<String>>, Vec<TemporalConstraint>);
902
903/// Analyze temporal constraints for a single file
904fn analyze_temporal_file(path: &Path, args: &TemporalArgs) -> PatternsResult<TemporalFileAnalysis> {
905    // Validate path
906    let canonical = if let Some(ref root) = args.project_root {
907        validate_file_path_in_project(path, root)?
908    } else {
909        validate_file_path(path)?
910    };
911
912    // Read source
913    let source = read_file_safe(&canonical)?;
914    let file_path_str = canonical.to_string_lossy().to_string();
915
916    // Detect language: explicit --lang flag wins, otherwise auto-detect
917    // from the file extension. Default to Python on failure to preserve
918    // backward compatibility with the original Python-only behaviour.
919    let language = args
920        .lang
921        .or_else(|| Language::from_path(&canonical))
922        .unwrap_or(Language::Python);
923
924    // Extract sequences via the language-aware path.
925    let file_seqs = extract_sequences_for_file(&canonical, &source, language)?;
926    let sequences = file_seqs.sequences;
927
928    // Mine bigrams
929    let (_, constraints) = mine_bigrams(&sequences, &file_path_str, args);
930
931    Ok((sequences, constraints))
932}
933
934/// Analyze temporal constraints for a directory
935fn analyze_temporal_directory(
936    path: &Path,
937    args: &TemporalArgs,
938    start_time: Instant,
939) -> PatternsResult<TemporalReport> {
940    let canonical = validate_directory_path(path)?;
941    let timeout = Duration::from_secs(args.timeout);
942
943    let mut all_sequences: HashMap<String, Vec<String>> = HashMap::new();
944    let mut all_examples: HashMap<(String, String), Vec<TemporalExample>> = HashMap::new();
945    let mut bigram_counts: HashMap<(String, String), u32> = HashMap::new();
946    let mut before_counts: HashMap<String, u32> = HashMap::new();
947    let mut files_analyzed = 0u32;
948
949    // VAL-016: Determine the project's language. Auto-detect from
950    // manifest precedence + extension majority unless --lang overrode it.
951    // Falls back to Python so a directory of `.py` files without a
952    // manifest still works exactly like before.
953    let resolved_lang = resolve_directory_language(&canonical, args);
954
955    // OCaml is supported by the call-graph builder but NOT by the
956    // single-file extract_calls_for_language API. To cover ocaml
957    // (and as a robustness net for any future skew between the two),
958    // we use the project-wide builder when the resolved language is
959    // OCaml — otherwise we use the per-file walker which is cheaper.
960    let use_project_builder = matches!(resolved_lang, Some(Language::Ocaml));
961
962    if use_project_builder {
963        // Project-wide path: build the full call-graph IR and iterate
964        // FileIR.calls per file. This routes through every language's
965        // call-graph handler (including OCaml).
966        let lang = resolved_lang.expect("checked above");
967        let mut config = BuildConfig {
968            language: lang.as_str().to_string(),
969            respect_ignore: false,
970            ..Default::default()
971        };
972        config.use_type_resolution = false;
973        match build_project_call_graph_v2(&canonical, config) {
974            Ok(ir) => {
975                for (file_path, file_ir) in &ir.files {
976                    if start_time.elapsed() > timeout {
977                        break;
978                    }
979                    files_analyzed += 1;
980                    if files_analyzed > args.max_files {
981                        break;
982                    }
983                    check_directory_file_count(files_analyzed as usize)?;
984
985                    // FileIR.path is relative to project root; rejoin.
986                    let abs_path = if file_path.is_absolute() {
987                        file_path.clone()
988                    } else {
989                        canonical.join(file_path)
990                    };
991                    let file_key = abs_path.to_string_lossy().to_string();
992                    let scoped = sequences_from_callsite_map(&file_key, &file_ir.calls);
993
994                    aggregate_file_sequences(
995                        &scoped,
996                        &file_key,
997                        &per_caller_first_line(&file_ir.calls),
998                        &mut all_sequences,
999                        &mut bigram_counts,
1000                        &mut before_counts,
1001                        &mut all_examples,
1002                        args,
1003                    );
1004                }
1005            }
1006            Err(_) => {
1007                // Builder failed — fall through to empty report. We do
1008                // not silently swallow errors elsewhere; the report's
1009                // metadata.files_analyzed=0 will trip the matrix's
1010                // SILENT_FAIL guard if this hits in practice.
1011            }
1012        }
1013    } else {
1014        // Per-file walker path (Python + 16 other languages).
1015        for entry in tldr_core::walker::walk_project(&canonical) {
1016            // Check timeout (E03 mitigation)
1017            if start_time.elapsed() > timeout {
1018                break;
1019            }
1020
1021            let entry_path = entry.path();
1022
1023            // VAL-016: dispatch on language detected from file extension.
1024            // Skip files with no recognised language (avoids parsing
1025            // markdown/yaml/etc.). The --lang flag, if provided, must
1026            // match the entry language too — otherwise we'd extract
1027            // sequences with a parser mis-matched to the file.
1028            let entry_lang = match Language::from_path(entry_path) {
1029                Some(lang) => lang,
1030                None => continue,
1031            };
1032            if let Some(forced) = args.lang {
1033                if forced != entry_lang {
1034                    continue;
1035                }
1036            } else if let Some(project_lang) = resolved_lang {
1037                if project_lang != entry_lang {
1038                    continue;
1039                }
1040            }
1041
1042            // Check file count limit
1043            files_analyzed += 1;
1044            if files_analyzed > args.max_files {
1045                break;
1046            }
1047            check_directory_file_count(files_analyzed as usize)?;
1048
1049            // Analyze file
1050            let file_path_str = entry_path.to_string_lossy().to_string();
1051            if let Ok(source) = read_file_safe(entry_path) {
1052                let file_seqs = match extract_sequences_for_file(entry_path, &source, entry_lang) {
1053                    Ok(s) => s,
1054                    Err(_) => continue,
1055                };
1056
1057                aggregate_file_sequences(
1058                    &file_seqs.sequences,
1059                    &file_path_str,
1060                    &file_seqs.first_line,
1061                    &mut all_sequences,
1062                    &mut bigram_counts,
1063                    &mut before_counts,
1064                    &mut all_examples,
1065                    args,
1066                );
1067            }
1068        }
1069    }
1070
1071    // Build constraints from aggregated data
1072    let mut constraints = Vec::new();
1073
1074    for ((before, after), count) in &bigram_counts {
1075        if *count < args.min_support {
1076            continue;
1077        }
1078
1079        let before_total = *before_counts.get(before).unwrap_or(&1);
1080        let confidence = (*count as f64) / (before_total as f64);
1081
1082        if confidence < args.min_confidence {
1083            continue;
1084        }
1085
1086        let examples = all_examples
1087            .get(&(before.clone(), after.clone()))
1088            .cloned()
1089            .unwrap_or_default();
1090
1091        constraints.push(TemporalConstraint {
1092            before: before.clone(),
1093            after: after.clone(),
1094            support: *count,
1095            confidence,
1096            examples,
1097        });
1098    }
1099
1100    // Sort by confidence, then support
1101    constraints.sort_by(|a, b| {
1102        b.confidence
1103            .partial_cmp(&a.confidence)
1104            .unwrap_or(std::cmp::Ordering::Equal)
1105            .then_with(|| b.support.cmp(&a.support))
1106    });
1107
1108    // Apply query filter if specified
1109    if let Some(ref query) = args.query {
1110        constraints = filter_by_query(constraints, query);
1111    }
1112
1113    // Mine trigrams if requested
1114    let trigrams = if args.include_trigrams {
1115        let mut trigrams = mine_trigrams(&all_sequences, args);
1116        if let Some(ref query) = args.query {
1117            trigrams = filter_trigrams_by_query(trigrams, query);
1118        }
1119        trigrams
1120    } else {
1121        Vec::new()
1122    };
1123
1124    let sequences_extracted: u32 = all_sequences.values().map(|v| v.len() as u32).sum();
1125
1126    Ok(TemporalReport {
1127        constraints,
1128        trigrams,
1129        metadata: TemporalMetadata {
1130            files_analyzed,
1131            sequences_extracted,
1132            min_support: args.min_support,
1133            min_confidence: args.min_confidence,
1134        },
1135    })
1136}
1137
1138// =============================================================================
1139// Text Formatting
1140// =============================================================================
1141
1142/// Format a temporal report as human-readable text
1143pub fn format_temporal_text(report: &TemporalReport) -> String {
1144    let mut lines = Vec::new();
1145
1146    lines.push("Temporal Constraints".to_string());
1147    lines.push("=".repeat(40));
1148    lines.push(String::new());
1149
1150    if report.constraints.is_empty() {
1151        lines.push("No constraints found matching criteria.".to_string());
1152    } else {
1153        lines.push(format!("Found {} constraints:", report.constraints.len()));
1154        lines.push(String::new());
1155
1156        for constraint in &report.constraints {
1157            lines.push(format!("  {} -> {}", constraint.before, constraint.after));
1158            lines.push(format!(
1159                "    support: {}, confidence: {:.2}",
1160                constraint.support, constraint.confidence
1161            ));
1162
1163            if !constraint.examples.is_empty() {
1164                lines.push("    examples:".to_string());
1165                for example in &constraint.examples {
1166                    lines.push(format!("      - {}:{}", example.file, example.line));
1167                }
1168            }
1169            lines.push(String::new());
1170        }
1171    }
1172
1173    if !report.trigrams.is_empty() {
1174        lines.push(String::new());
1175        lines.push("Trigrams".to_string());
1176        lines.push("-".repeat(40));
1177        lines.push(String::new());
1178
1179        for trigram in &report.trigrams {
1180            lines.push(format!(
1181                "  {} -> {} -> {}",
1182                trigram.sequence[0], trigram.sequence[1], trigram.sequence[2]
1183            ));
1184            lines.push(format!(
1185                "    support: {}, confidence: {:.2}",
1186                trigram.support, trigram.confidence
1187            ));
1188            lines.push(String::new());
1189        }
1190    }
1191
1192    lines.push(String::new());
1193    lines.push("Metadata".to_string());
1194    lines.push("-".repeat(40));
1195    lines.push(format!(
1196        "  Files analyzed: {}",
1197        report.metadata.files_analyzed
1198    ));
1199    lines.push(format!(
1200        "  Sequences extracted: {}",
1201        report.metadata.sequences_extracted
1202    ));
1203    lines.push(format!("  Min support: {}", report.metadata.min_support));
1204    lines.push(format!(
1205        "  Min confidence: {:.2}",
1206        report.metadata.min_confidence
1207    ));
1208
1209    lines.join("\n")
1210}
1211
1212// =============================================================================
1213// Entry Point
1214// =============================================================================
1215
1216/// Execute the temporal command
1217pub fn run(args: TemporalArgs, global_format: GlobalOutputFormat) -> anyhow::Result<()> {
1218    let start_time = Instant::now();
1219    let path = &args.path;
1220
1221    // VAL-016: validate the legacy `--source-lang` flag against the
1222    // 18 supported TLDR languages plus the synthetic "auto" sentinel.
1223    // The canonical way to override language is the global `--lang/-l`
1224    // flag (see `args.lang`); `--source-lang` is preserved only for
1225    // backward compatibility with the original Python-only CLI.
1226    let source_lang_norm = args.source_lang.to_lowercase();
1227    if source_lang_norm != "auto" && source_lang_norm.parse::<Language>().is_err() {
1228        return Err(PatternsError::UnsupportedLanguage {
1229            language: args.source_lang.clone(),
1230        }
1231        .into());
1232    }
1233
1234    let report = if path.is_dir() {
1235        analyze_temporal_directory(path, &args, start_time)?
1236    } else {
1237        let (sequences, mut constraints) = analyze_temporal_file(path, &args)?;
1238
1239        // Apply query filter if specified
1240        if let Some(ref query) = args.query {
1241            constraints = filter_by_query(constraints, query);
1242        }
1243
1244        // Mine trigrams if requested
1245        let trigrams = if args.include_trigrams {
1246            let mut trigrams = mine_trigrams(&sequences, &args);
1247            if let Some(ref query) = args.query {
1248                trigrams = filter_trigrams_by_query(trigrams, query);
1249            }
1250            trigrams
1251        } else {
1252            Vec::new()
1253        };
1254
1255        let sequences_extracted: u32 = sequences.values().map(|v| v.len() as u32).sum();
1256
1257        TemporalReport {
1258            constraints,
1259            trigrams,
1260            metadata: TemporalMetadata {
1261                files_analyzed: 1,
1262                sequences_extracted,
1263                min_support: args.min_support,
1264                min_confidence: args.min_confidence,
1265            },
1266        }
1267    };
1268
1269    // Resolve format: global -f flag takes priority over hidden --output-format
1270    let use_text = matches!(global_format, GlobalOutputFormat::Text)
1271        || matches!(args.output_format, OutputFormat::Text);
1272
1273    // Check if no constraints found -> exit code 2
1274    if report.constraints.is_empty() && report.trigrams.is_empty() {
1275        if use_text {
1276            println!("{}", format_temporal_text(&report));
1277        } else {
1278            let json = serde_json::to_string_pretty(&report)?;
1279            println!("{}", json);
1280        }
1281        std::process::exit(2);
1282    }
1283
1284    if use_text {
1285        println!("{}", format_temporal_text(&report));
1286    } else {
1287        let json = serde_json::to_string_pretty(&report)?;
1288        println!("{}", json);
1289    }
1290
1291    Ok(())
1292}
1293
1294// =============================================================================
1295// Tests
1296// =============================================================================
1297
#[cfg(test)]
mod tests {
    use super::*;

    use std::io::Write;

    /// Baseline CLI arguments shared by the tests. Individual tests override
    /// the fields they care about with struct-update syntax.
    fn baseline_args() -> TemporalArgs {
        TemporalArgs {
            path: PathBuf::new(),
            min_support: 1,
            min_confidence: 0.0,
            query: None,
            source_lang: "python".to_string(),
            max_files: 1000,
            include_trigrams: false,
            include_examples: 3,
            output_format: OutputFormat::Json,
            timeout: 60,
            project_root: None,
            lang: None,
        }
    }

    /// Helper: a one-entry sequence map (`func:f`) holding `open, read, close`.
    fn open_read_close_sequences() -> HashMap<String, Vec<String>> {
        let calls: Vec<String> = ["open", "read", "close"]
            .iter()
            .map(|name| name.to_string())
            .collect();
        let mut sequences = HashMap::new();
        sequences.insert("func:f".to_string(), calls);
        sequences
    }

    #[test]
    fn test_extract_sequences_simple() {
        let code = r#"
def read_config(path):
    f = open(path)
    content = f.read()
    f.close()
    return content
"#;
        let sequences = extract_sequences(code);

        // Should have a sequence for f
        let has_f_sequence = sequences.keys().any(|key| key.contains(":f"));
        assert!(has_f_sequence, "Should extract sequence for variable f");
    }

    #[test]
    fn test_bigram_counter() {
        let sequences = open_read_close_sequences();

        let mut counter = BigramCounter::new();
        counter.add_sequences(&sequences, "test.py");

        let open_read = ("open".to_string(), "read".to_string());
        let read_close = ("read".to_string(), "close".to_string());
        assert_eq!(counter.counts.get(&open_read), Some(&1));
        assert_eq!(counter.counts.get(&read_close), Some(&1));
    }

    #[test]
    fn test_mine_bigrams_filter() {
        let sequences = open_read_close_sequences();
        let args = baseline_args();

        let (_, constraints) = mine_bigrams(&sequences, "test.py", &args);

        assert!(!constraints.is_empty(), "Should find bigram constraints");
    }

    #[test]
    fn test_filter_by_query() {
        let open_read = TemporalConstraint {
            before: "open".to_string(),
            after: "read".to_string(),
            support: 5,
            confidence: 0.8,
            examples: vec![],
        };
        let acquire_release = TemporalConstraint {
            before: "acquire".to_string(),
            after: "release".to_string(),
            support: 3,
            confidence: 0.9,
            examples: vec![],
        };

        let filtered = filter_by_query(vec![open_read, acquire_release], "open");
        assert_eq!(filtered.len(), 1);
        assert_eq!(filtered[0].before, "open");
    }

    #[test]
    fn test_mine_trigrams_limit() {
        // Create sequences that would generate many trigrams
        let calls: Vec<String> = (0..100).map(|i| format!("method{}", i)).collect();
        let mut sequences = HashMap::new();
        sequences.insert("func:obj".to_string(), calls);

        let args = TemporalArgs {
            include_trigrams: true,
            ..baseline_args()
        };

        let trigrams = mine_trigrams(&sequences, &args);

        // Should respect MAX_TRIGRAMS limit
        assert!(trigrams.len() <= MAX_TRIGRAMS);
    }

    #[test]
    fn test_format_temporal_text() {
        let open_close = TemporalConstraint {
            before: "open".to_string(),
            after: "close".to_string(),
            support: 10,
            confidence: 0.95,
            examples: vec![TemporalExample {
                file: "test.py".to_string(),
                line: 5,
            }],
        };
        let report = TemporalReport {
            constraints: vec![open_close],
            trigrams: vec![],
            metadata: TemporalMetadata {
                files_analyzed: 1,
                sequences_extracted: 5,
                min_support: 2,
                min_confidence: 0.5,
            },
        };

        let text = format_temporal_text(&report);
        for expected in ["open -> close", "support: 10", "confidence: 0.95"] {
            assert!(text.contains(expected));
        }
    }

    #[test]
    fn test_temporal_args_lang_flag() {
        use tldr_core::types::Language;

        // Verify TemporalArgs has a lang field of type Option<Language>
        let args = TemporalArgs {
            path: PathBuf::from("src/"),
            min_support: 2,
            min_confidence: 0.5,
            lang: Some(Language::Python),
            ..baseline_args()
        };
        assert_eq!(args.lang, Some(Language::Python));

        // Also test None case (auto-detect)
        let args_auto = TemporalArgs {
            path: PathBuf::from("src/"),
            min_support: 2,
            min_confidence: 0.5,
            lang: None,
            ..baseline_args()
        };
        assert_eq!(args_auto.lang, None);
    }

    // ====================================================================
    // VAL-016: per-language sequence-extraction unit tests
    // ====================================================================
    //
    // Each test asserts that `extract_sequences_for_file` returns a
    // sequence containing the bigram `helper -> b_util` when fed a tiny
    // function that calls `helper()` then `b_util()` in source order.
    // The fixture mirrors the canonical 18-language matrix fixture in
    // crates/tldr-cli/tests/fixtures/mod.rs.

    /// Helper: write `source` to a temp file with `extension`, run the
    /// generalized extractor, and return the merged list of sequences.
    fn extract_for_lang(extension: &str, source: &str, language: Language) -> Vec<Vec<String>> {
        let suffix = format!(".{}", extension);
        let mut tmp = tempfile::Builder::new()
            .suffix(&suffix)
            .tempfile()
            .expect("tempfile");
        tmp.write_all(source.as_bytes()).expect("write source");

        let path = tmp.path().to_path_buf();
        extract_sequences_for_file(&path, source, language)
            .expect("extract_sequences_for_file")
            .sequences
            .into_values()
            .collect()
    }

    /// Helper: assert the extracted sequences contain a `helper -> b_util`
    /// adjacency in some scope. Built on top of `windows(2)` so it stays
    /// agnostic to scope/key formatting differences across languages.
    fn assert_helper_then_b_util(seqs: &[Vec<String>], language_label: &str) {
        let found = seqs.iter().any(|seq| {
            seq.windows(2)
                .any(|pair| pair[0] == "helper" && pair[1] == "b_util")
        });
        assert!(
            found,
            "[{}] expected `helper -> b_util` bigram, got: {:?}",
            language_label, seqs
        );
    }

    #[test]
    fn test_extract_sequences_typescript() {
        // TypeScript: function main() { helper(); b_util(); }
        let source = "\
function helper(): number { return 1; }
function b_util(): number { return 2; }
function main(): void {
  helper();
  b_util();
}
";
        let seqs = extract_for_lang("ts", source, Language::TypeScript);
        assert_helper_then_b_util(&seqs, "typescript");
    }

    #[test]
    fn test_extract_sequences_java() {
        // Java: methods inside a class. The Java callgraph handler
        // qualifies callers as `Main.main`; the bigram still fires.
        let source = "\
class Main {
    public static int helper() { return 1; }
    public static int bUtil() { return 2; }
    public static void main(String[] args) {
        helper();
        bUtil();
    }
}
";
        // The second callee is spelled `bUtil` (Java idiom) rather than
        // `b_util`, so the adjacency check is written out here instead of
        // going through `assert_helper_then_b_util`.
        let mut tmp = tempfile::Builder::new().suffix(".java").tempfile().unwrap();
        tmp.write_all(source.as_bytes()).unwrap();
        let path = tmp.path().to_path_buf();

        let file_seqs = extract_sequences_for_file(&path, source, Language::Java).expect("extract");
        let seqs: Vec<Vec<String>> = file_seqs.sequences.into_values().collect();

        let found = seqs.iter().any(|seq| {
            seq.windows(2)
                .any(|pair| pair[0] == "helper" && pair[1] == "bUtil")
        });
        assert!(
            found,
            "[java] expected `helper -> bUtil` bigram, got: {:?}",
            seqs
        );
    }

    #[test]
    fn test_extract_sequences_go() {
        // Go: func main calls helper() then b_util()
        let source = "\
package main

func helper() int { return 1 }
func b_util() int { return 2 }
func main() {
    helper()
    b_util()
}
";
        let seqs = extract_for_lang("go", source, Language::Go);
        assert_helper_then_b_util(&seqs, "go");
    }

    #[test]
    fn test_extract_sequences_rust() {
        // Rust: fn main calls helper() then b_util()
        let source = "\
fn helper() -> i32 { 1 }
fn b_util() -> i32 { 2 }
fn main() {
    let _ = helper();
    let _ = b_util();
}
";
        let seqs = extract_for_lang("rs", source, Language::Rust);
        assert_helper_then_b_util(&seqs, "rust");
    }

    #[test]
    fn test_extract_sequences_python_via_generalized_path() {
        // Python regression — the new dispatch must still emit the
        // helper -> b_util bigram for Python (via the legacy walker).
        let source = "\
def helper():
    return 1

def b_util():
    return 2

def main():
    helper()
    b_util()
";
        let seqs = extract_for_lang("py", source, Language::Python);
        assert_helper_then_b_util(&seqs, "python");
    }

    #[test]
    fn test_extract_sequences_python_legacy_receiver_aware() {
        // Python regression — the legacy receiver-aware walker must
        // still emit `[open, read, close]` keyed by `<func>:f`. This
        // covers the bespoke "with statement implies __exit__" logic
        // that the call-graph IR doesn't model.
        let source = "\
def read_config(path):
    f = open(path)
    content = f.read()
    f.close()
    return content
";
        let mut tmp = tempfile::Builder::new().suffix(".py").tempfile().unwrap();
        tmp.write_all(source.as_bytes()).unwrap();
        let path = tmp.path().to_path_buf();

        let file_seqs =
            extract_sequences_for_file(&path, source, Language::Python).expect("extract");

        let has_open_read = file_seqs.sequences.values().any(|seq| {
            seq.windows(2)
                .any(|pair| pair[0] == "open" && pair[1] == "read")
        });
        assert!(
            has_open_read,
            "python legacy: expected `open -> read` bigram for receiver f, got: {:?}",
            file_seqs.sequences
        );
    }

    #[test]
    fn test_sequences_from_callsite_map_orders_by_line() {
        // Unit test for the line-sort invariant. Two CallSites for the
        // same caller delivered out of order by line must come back
        // sorted ascending.
        use tldr_core::callgraph::CallSite;

        // intentionally deliver line 8 first
        let out_of_order = vec![
            CallSite::direct("main".to_string(), "b_util".to_string(), Some(8)),
            CallSite::direct("main".to_string(), "helper".to_string(), Some(7)),
        ];
        let mut calls: HashMap<String, Vec<CallSite>> = HashMap::new();
        calls.insert("main".to_string(), out_of_order);

        let out = sequences_from_callsite_map("/tmp/foo", &calls);
        let main_seq = out.get("/tmp/foo::main").expect("main sequence");
        assert_eq!(
            main_seq,
            &vec!["helper".to_string(), "b_util".to_string()],
            "calls must be ordered by line ascending (sequences_from_callsite_map)"
        );
    }
}