cqs 1.22.0

Code intelligence and RAG for AI agents. Semantic search, call graphs, impact analysis, type dependencies, and smart context assembly — in single tool calls. 54 languages + L5X/L5K PLC exports, 91.2% Recall@1 (BGE-large), 0.951 MRR (296 queries). Local ML, GPU-accelerated.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
pub mod bm25;
pub mod checkpoint;
pub mod diff;
pub mod git;
pub mod query;

use std::collections::HashMap;
use std::fs::{File, OpenOptions};
use std::io::{BufWriter, Write};
use std::path::{Path, PathBuf};

use crate::parser::{Chunk, Language, Parser};

use self::bm25::Bm25Index;
use self::checkpoint::{read_checkpoints, truncate_incomplete_line, write_checkpoint};
use self::diff::{find_changed_functions, parse_diff_output, FunctionSpan};
use self::git::{git_diff_tree, git_log, git_show, is_shallow};
use self::query::normalize_query;

// ─── Error ──────────────────────────────────────────────────────────────────

/// Errors that can abort a training data generation run.
///
/// The `Io`, `Parser` and `Json` variants are created automatically by `?`
/// through their `#[from]` conversions.
#[derive(Debug, thiserror::Error)]
pub enum TrainDataError {
    /// A git operation failed; the payload is a human-readable description.
    /// NOTE(review): never constructed in this file — presumably produced by
    /// the helpers in the `git` submodule; confirm there.
    #[error("Git error: {0}")]
    Git(String),
    /// Reading or writing the output / checkpoint files failed.
    #[error("IO error: {0}")]
    Io(#[from] std::io::Error),
    /// The source parser reported an error.
    #[error("Parser error: {0}")]
    Parser(#[from] crate::parser::ParserError),
    /// Serializing a triplet to JSON failed.
    #[error("JSON error: {0}")]
    Json(#[from] serde_json::Error),
    /// Setup failed. In this file it is only used to wrap a `Parser::new()`
    /// failure (the payload is that error's display text).
    #[error("Invalid repo: {0}")]
    InvalidRepo(String),
}

// ─── Types ──────────────────────────────────────────────────────────────────

/// A single training triplet: query + positive + hard negatives.
///
/// Serialized as one JSON object per line of the output file. Fields are
/// emitted in declaration order, so reordering them changes the output.
#[derive(Debug, serde::Serialize)]
pub struct Triplet {
    /// Commit message after `normalize_query`.
    pub query: String,
    /// Commit message exactly as returned by git log.
    pub raw_query: String,
    /// Content of the changed function at the commit (the positive example).
    pub positive: String,
    /// Contents of the hard negatives selected by the BM25 index.
    pub negatives: Vec<String>,
    /// Repository path, rendered with `Path::display`.
    pub repo: String,
    /// SHA of the commit the triplet was mined from.
    pub commit: String,
    /// Path of the changed file as reported by the diff.
    pub file: String,
    /// Name of the changed function.
    pub function_name: String,
    /// Language of the file, as rendered by `Language`'s `to_string`.
    pub language: String,
    /// Number of files touched by the commit.
    pub files_changed: usize,
    /// Length of `raw_query` in bytes.
    pub msg_len: usize,
    /// Added lines in this file's diff (`total_added_lines`).
    pub diff_lines: usize,
    /// Length of `positive` in bytes.
    pub function_size: usize,
    /// Commit date string as returned by git log.
    pub commit_date: String,
    /// Optional metadata; omitted from the JSON when `None`.
    /// `generate_training_data` in this file always leaves it `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub chunk_type: Option<String>,
    /// Optional metadata; omitted from the JSON when `None`.
    /// `generate_training_data` in this file always leaves it `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub caller_count: Option<usize>,
    /// Optional metadata; omitted from the JSON when `None`.
    /// `generate_training_data` in this file always leaves it `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub callee_count: Option<usize>,
    /// Optional metadata; omitted from the JSON when `None`.
    /// `generate_training_data` in this file always leaves it `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub has_doc_comment: Option<bool>,
}

/// Configuration for training data generation.
#[derive(Debug, serde::Serialize)]
pub struct TrainDataConfig {
    /// Repositories to mine. Paths that do not look like a git repository
    /// (no `.git` and no `HEAD` entry) are skipped with a warning.
    pub repos: Vec<PathBuf>,
    /// JSONL output file. The checkpoint file lives next to it, at
    /// `output.with_extension("jsonl.checkpoint")`.
    pub output: PathBuf,
    /// Commit limit passed to `git_log`.
    /// NOTE(review): the tests pass 0, presumably meaning "no limit" —
    /// confirm against `git::git_log`.
    pub max_commits: usize,
    /// Commits whose message is shorter than this many bytes are skipped.
    pub min_msg_len: usize,
    /// Commits touching more than this many files are skipped.
    pub max_files: usize,
    /// Maximum number of triplets emitted per identical function content
    /// (by BLAKE3 hash), counted per repository within one run.
    pub dedup_cap: usize,
    /// Append to an existing output file and skip commits up to and including
    /// the checkpointed SHA, instead of starting from scratch.
    pub resume: bool,
    /// Emit a debug-level log event for every processed commit.
    pub verbose: bool,
}

/// Statistics from a training data generation run.
///
/// All counters are cumulative across every repository in the run.
#[derive(Debug, serde::Serialize)]
pub struct TrainDataStats {
    /// Triplets written to the output file during this run.
    pub total_triplets: usize,
    /// Repositories that passed validation and were walked.
    pub repos_processed: usize,
    /// Commits whose changed files were examined (whether or not they
    /// produced any triplet).
    pub commits_processed: usize,
    /// Commits skipped: already covered by a checkpoint, message too short,
    /// diff unavailable, or diff empty / touching too many files.
    pub commits_skipped: usize,
    /// Files from commits that failed to parse or made the parser panic.
    pub parse_failures: usize,
    /// Triplets written, keyed by language name.
    pub language_counts: HashMap<String, usize>,
}

// ─── Orchestration ──────────────────────────────────────────────────────────

/// Generate training data JSONL from git history across one or more repos.
///
/// For each repo: walks the working tree on disk to build a BM25 corpus, then
/// iterates commits to find changed functions. Each changed function produces
/// one triplet with the normalized commit message as query, the function
/// content as positive, and BM25-selected hard negatives.
///
/// # Resume
/// With `config.resume` set, the output file is appended to and, for each
/// repo with a recorded checkpoint, every commit up to and including the
/// checkpointed SHA (in `git_log` order) is skipped. Buffered triplets are
/// flushed to disk *before* a commit's checkpoint is written, so a checkpoint
/// never points past data that has not reached the output file. The per-repo
/// dedup counters start empty on every run, resumed or not.
///
/// # Errors
/// Fails if the parser cannot be constructed, the output or checkpoint files
/// cannot be read or written, `git_log` fails, or a triplet cannot be
/// serialized. Per-commit problems (diff failure, unreadable file, parse
/// failure or panic) do not abort the run; the commit or file is skipped.
pub fn generate_training_data(config: &TrainDataConfig) -> Result<TrainDataStats, TrainDataError> {
    let _span = tracing::info_span!("generate_training_data").entered();

    let parser = Parser::new().map_err(|e| TrainDataError::InvalidRepo(e.to_string()))?;

    // Checkpoint path: the output path with its extension *replaced* by
    // "jsonl.checkpoint" (e.g. `train.jsonl` -> `train.jsonl.checkpoint`).
    let checkpoint_path = config.output.with_extension("jsonl.checkpoint");

    // Load checkpoints for resume. A partially written trailing line from an
    // interrupted run is trimmed first so appended output stays valid JSONL.
    let checkpoints = if config.resume {
        truncate_incomplete_line(&config.output)?;
        read_checkpoints(&checkpoint_path)?
    } else {
        HashMap::new()
    };

    // Open output file (append if resume, create/truncate otherwise)
    let output_file = if config.resume {
        OpenOptions::new()
            .create(true)
            .append(true)
            .open(&config.output)?
    } else {
        File::create(&config.output)?
    };
    let mut writer = BufWriter::new(output_file);

    let mut stats = TrainDataStats {
        total_triplets: 0,
        repos_processed: 0,
        commits_processed: 0,
        commits_skipped: 0,
        parse_failures: 0,
        language_counts: HashMap::new(),
    };

    for repo_path in &config.repos {
        let repo_str = repo_path.display().to_string();
        let _repo_span = tracing::info_span!("repo", repo = %repo_str).entered();

        // Validate repo: accept a normal checkout (`.git`) or a bare repo (`HEAD`).
        if !repo_path.join(".git").exists() && !repo_path.join("HEAD").exists() {
            tracing::warn!(repo = %repo_str, "Not a git repository, skipping");
            continue;
        }

        // Shallow clone warning
        if is_shallow(repo_path) {
            tracing::warn!(
                repo = %repo_str,
                "Repository is a shallow clone — limited commit history"
            );
        }

        // Step 1: Build BM25 corpus from the files currently on disk
        let bm25_docs = build_bm25_corpus(repo_path, &parser);
        let bm25 = Bm25Index::build(&bm25_docs);
        tracing::info!(
            repo = %repo_str,
            functions = bm25_docs.len(),
            "Built BM25 index from HEAD"
        );

        // Step 2: Walk git log
        let commits = git_log(repo_path, config.max_commits)?;
        let checkpoint_sha = checkpoints.get(&repo_str).cloned();

        // Track dedup per repo: content_hash -> count
        let mut dedup: HashMap<String, usize> = HashMap::new();

        let mut repo_triplets = 0usize;
        let mut past_checkpoint = checkpoint_sha.is_none();

        for (commit_idx, commit) in commits.iter().enumerate() {
            // Skip commits already processed: everything up to and including
            // the checkpoint SHA counts as skipped.
            if !past_checkpoint {
                if checkpoint_sha.as_deref() == Some(commit.sha.as_str()) {
                    past_checkpoint = true;
                }
                stats.commits_skipped += 1;
                continue;
            }

            // Skip short messages
            if commit.message.len() < config.min_msg_len {
                stats.commits_skipped += 1;
                continue;
            }

            // Get diff
            let diff_str = match git_diff_tree(repo_path, &commit.sha) {
                Ok(d) => d,
                Err(e) => {
                    tracing::warn!(sha = %commit.sha, error = %e, "Failed to get diff");
                    stats.commits_skipped += 1;
                    continue;
                }
            };

            let diff_files = parse_diff_output(&diff_str);

            // Skip if too many files or empty
            if diff_files.is_empty() || diff_files.len() > config.max_files {
                stats.commits_skipped += 1;
                // Still write checkpoint so we don't re-visit. Nothing is
                // buffered here: every processed commit flushes before its
                // own checkpoint below.
                write_checkpoint(&checkpoint_path, &repo_str, &commit.sha)?;
                continue;
            }

            let files_changed = diff_files.len();
            let raw_query = commit.message.clone();
            let query = normalize_query(&raw_query);

            // Process each changed file
            for diff_file in &diff_files {
                // Check extension is supported
                let ext = match Path::new(&diff_file.path).extension() {
                    Some(e) => e.to_string_lossy().to_string(),
                    None => continue,
                };

                let language = match Language::from_extension(&ext) {
                    Some(l) => l,
                    None => continue,
                };

                // Get file content at this commit
                let content = match git_show(repo_path, &commit.sha, &diff_file.path) {
                    Ok(Some(c)) => c,
                    Ok(None) => continue, // oversized or binary
                    Err(_) => continue,   // file doesn't exist at this commit
                };

                // Parse to get function spans (catch panics from malformed content)
                let parse_result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
                    parser.parse_source(&content, language, Path::new(&diff_file.path))
                }));
                let chunks = match parse_result {
                    Ok(Ok(c)) => c,
                    Ok(Err(e)) => {
                        tracing::debug!(
                            file = %diff_file.path,
                            sha = %commit.sha,
                            error = %e,
                            "Parse failed"
                        );
                        stats.parse_failures += 1;
                        continue;
                    }
                    Err(_) => {
                        tracing::warn!(
                            file = %diff_file.path,
                            sha = %commit.sha,
                            "Parse panicked — skipping"
                        );
                        stats.parse_failures += 1;
                        continue;
                    }
                };

                let functions: Vec<FunctionSpan> = chunks_to_function_spans(&chunks);
                let changed = find_changed_functions(&diff_file.hunks, &functions);
                let diff_lines = diff_file.total_added_lines();

                for func in &changed {
                    // Dedup by content hash. The counter keeps growing past
                    // the cap; only the first `dedup_cap` occurrences emit.
                    let content_hash = blake3::hash(func.content.as_bytes()).to_hex().to_string();
                    let count = dedup.entry(content_hash.clone()).or_insert(0);
                    *count += 1;
                    if *count > config.dedup_cap {
                        continue;
                    }

                    // Select hard negatives
                    let negatives_raw =
                        bm25.select_negatives(&query, &content_hash, &func.content, 5);
                    let negatives: Vec<String> =
                        negatives_raw.into_iter().map(|(_, c)| c).collect();

                    let triplet = Triplet {
                        query: query.clone(),
                        raw_query: raw_query.clone(),
                        positive: func.content.clone(),
                        negatives,
                        repo: repo_str.clone(),
                        commit: commit.sha.clone(),
                        file: diff_file.path.clone(),
                        function_name: func.name.clone(),
                        language: language.to_string(),
                        files_changed,
                        msg_len: raw_query.len(),
                        diff_lines,
                        function_size: func.content.len(),
                        commit_date: commit.date.clone(),
                        chunk_type: None,
                        caller_count: None,
                        callee_count: None,
                        has_doc_comment: None,
                    };

                    serde_json::to_writer(&mut writer, &triplet)?;
                    writer.write_all(b"\n")?;

                    stats.total_triplets += 1;
                    repo_triplets += 1;
                    *stats
                        .language_counts
                        .entry(language.to_string())
                        .or_insert(0) += 1;
                }
            }

            stats.commits_processed += 1;

            // Push this commit's triplets to the file before recording the
            // checkpoint. Otherwise a crash between the two could leave a
            // checkpoint that skips commits whose triplets were still sitting
            // in the BufWriter and never reached disk.
            writer.flush()?;
            write_checkpoint(&checkpoint_path, &repo_str, &commit.sha)?;

            if config.verbose {
                tracing::debug!(
                    sha = %commit.sha,
                    msg = %commit.message,
                    triplets = repo_triplets,
                    "Processed commit"
                );
            }

            // Progress every 100 commits
            if (commit_idx + 1) % 100 == 0 {
                tracing::info!(
                    repo = %repo_str,
                    commits = commit_idx + 1,
                    triplets = repo_triplets,
                    "Progress"
                );
            }
        }

        // A checkpoint that never matched means every commit above was
        // skipped (e.g. rewritten history or a smaller commit limit). Surface
        // it instead of silently producing nothing for this repo.
        if !past_checkpoint {
            if let Some(sha) = checkpoint_sha.as_deref() {
                tracing::warn!(
                    repo = %repo_str,
                    checkpoint = %sha,
                    "Checkpoint SHA not found in commit log — all commits were skipped"
                );
            }
        }

        // RM-1: Warn if dedup map grows excessively large (memory guard)
        if dedup.len() > 100_000 {
            tracing::warn!(
                entries = dedup.len(),
                repo = %repo_str,
                "dedup map exceeds 100K entries — high memory usage"
            );
        }

        stats.repos_processed += 1;
        tracing::info!(
            repo = %repo_str,
            triplets = repo_triplets,
            commits = stats.commits_processed,
            skipped = stats.commits_skipped,
            "Repo complete"
        );
    }

    // Explicit final flush: dropping a BufWriter swallows write errors.
    writer.flush()?;

    tracing::info!(
        total_triplets = stats.total_triplets,
        repos = stats.repos_processed,
        commits = stats.commits_processed,
        skipped = stats.commits_skipped,
        parse_failures = stats.parse_failures,
        "Training data generation complete"
    );

    Ok(stats)
}

// ─── Helpers ────────────────────────────────────────────────────────────────

/// Build one `FunctionSpan` per parser `Chunk`, in the same order.
///
/// Each span copies the chunk's name and content and widens its start/end
/// line numbers to `usize` so they can be intersected with diff hunks.
fn chunks_to_function_spans(chunks: &[Chunk]) -> Vec<FunctionSpan> {
    let mut spans = Vec::with_capacity(chunks.len());
    for chunk in chunks {
        let span = FunctionSpan {
            name: chunk.name.clone(),
            start_line: chunk.line_start as usize,
            end_line: chunk.line_end as usize,
            content: chunk.content.clone(),
        };
        spans.push(span);
    }
    spans
}

/// Collect the BM25 negative-candidate corpus from a repo's files on disk.
///
/// Walks `repo_path` with the `ignore` crate (hidden entries and
/// `.gitignore`d paths are skipped), parses every file whose extension maps
/// to a known `Language`, and returns one `(content_hash, content)` pair per
/// callable chunk, where the hash is the BLAKE3 hex digest of the content.
/// Chunks from config / markup languages are excluded. Files that fail to
/// parse (or make the parser panic) are counted and reported in one warning.
fn build_bm25_corpus(repo_path: &Path, parser: &Parser) -> Vec<(String, String)> {
    let _span = tracing::info_span!("build_bm25_corpus", repo = %repo_path.display()).entered();

    let walker = ignore::WalkBuilder::new(repo_path)
        .hidden(true) // skip dotfiles
        .git_ignore(true)
        .build();

    let mut corpus: Vec<(String, String)> = Vec::new();
    let mut corpus_parse_failures: usize = 0;

    // `flatten` drops walker entries that could not be read.
    for entry in walker.flatten() {
        let is_regular_file = entry.file_type().is_some_and(|ft| ft.is_file());
        if !is_regular_file {
            continue;
        }

        // Only files with a supported extension are parsed.
        let file_path = entry.path();
        let Some(ext) = file_path
            .extension()
            .map(|e| e.to_string_lossy().to_string())
        else {
            continue;
        };
        if Language::from_extension(&ext).is_none() {
            continue;
        }

        // Parse file (catch panics from malformed content)
        let path_owned = file_path.to_path_buf();
        let outcome = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
            parser.parse_file(&path_owned)
        }));
        let Ok(Ok(chunks)) = outcome else {
            corpus_parse_failures += 1;
            continue;
        };

        // Only callable chunks from programming languages as negatives.
        // Config files (TOML, YAML, JSON, INI) and docs (Markdown) produce
        // chunks that are too easy to discriminate — the base model already
        // handles code-vs-prose distinction. Training budget should go toward
        // hard negatives: similar-looking code functions with different purposes.
        corpus.extend(
            chunks
                .iter()
                .filter(|chunk| chunk.chunk_type.is_callable())
                .filter(|chunk| {
                    !matches!(
                        chunk.language,
                        Language::Toml
                            | Language::Yaml
                            | Language::Json
                            | Language::Ini
                            | Language::Markdown
                            | Language::Xml
                            | Language::Html
                            | Language::Css
                            | Language::Latex
                    )
                })
                .map(|chunk| {
                    let digest = blake3::hash(chunk.content.as_bytes()).to_hex().to_string();
                    (digest, chunk.content.clone())
                }),
        );
    }

    if corpus_parse_failures > 0 {
        tracing::warn!(corpus_parse_failures, "Corpus files failed to parse");
    }

    corpus
}

// ─── Tests ──────────────────────────────────────────────────────────────────

#[cfg(test)]
mod tests {
    use super::*;
    use std::process::Command;

    /// Create a test git repo with 3 commits and 2 files.
    /// Commit 1: "initial commit with hello function" — test.rs with `fn hello()`
    /// Commit 2: "add greeting utilities and goodbye" — test.rs modified (adds
    ///           `fn goodbye()`), utils.rs added with `fn greet()`
    /// Commit 3: "add farewell function to utils" — utils.rs gains `fn farewell()`
    fn create_test_repo() -> tempfile::TempDir {
        let dir = tempfile::TempDir::new().unwrap();
        let repo = dir.path();

        // git init + config
        run_git(repo, &["init"]);
        run_git(repo, &["config", "user.email", "test@test.com"]);
        run_git(repo, &["config", "user.name", "Test"]);

        // Commit 1: initial
        std::fs::write(
            repo.join("test.rs"),
            "fn hello() {\n    println!(\"hi\");\n}\n",
        )
        .unwrap();
        run_git(repo, &["add", "."]);
        run_git(
            repo,
            &["commit", "-m", "initial commit with hello function"],
        );

        // Commit 2: modify test.rs + add utils.rs
        std::fs::write(
            repo.join("test.rs"),
            "fn hello() {\n    println!(\"hello world\");\n}\n\nfn goodbye() {\n    println!(\"bye\");\n}\n",
        )
        .unwrap();
        std::fs::write(
            repo.join("utils.rs"),
            "fn greet(name: &str) {\n    println!(\"Hello, {}!\", name);\n}\n",
        )
        .unwrap();
        run_git(repo, &["add", "."]);
        run_git(
            repo,
            &["commit", "-m", "add greeting utilities and goodbye"],
        );

        // Commit 3: modify utils.rs
        std::fs::write(
            repo.join("utils.rs"),
            "fn greet(name: &str) {\n    println!(\"Hello, {}!\", name);\n}\n\nfn farewell(name: &str) {\n    println!(\"Goodbye, {}!\", name);\n}\n",
        )
        .unwrap();
        run_git(repo, &["add", "."]);
        run_git(repo, &["commit", "-m", "add farewell function to utils"]);

        dir
    }

    /// Run `git -C <repo> <args...>` and panic (showing the arguments and the
    /// command's stderr) if git cannot be spawned or exits unsuccessfully.
    fn run_git(repo: &Path, args: &[&str]) {
        let output = Command::new("git")
            .args(["-C"])
            .arg(repo)
            .args(args)
            .output()
            .unwrap();
        assert!(
            output.status.success(),
            "git {:?} failed: {}",
            args,
            String::from_utf8_lossy(&output.stderr)
        );
    }

    /// Build a single-repo `TrainDataConfig` with the settings every test
    /// shares (`max_commits: 0`, `min_msg_len: 10`, `max_files: 20`); only the
    /// knobs the tests actually vary are parameters.
    fn test_config(
        repo: &Path,
        output: &Path,
        dedup_cap: usize,
        resume: bool,
        verbose: bool,
    ) -> TrainDataConfig {
        TrainDataConfig {
            repos: vec![repo.to_path_buf()],
            output: output.to_path_buf(),
            max_commits: 0,
            min_msg_len: 10,
            max_files: 20,
            dedup_cap,
            resume,
            verbose,
        }
    }

    /// End-to-end run over the fixture repo: expects at least one triplet,
    /// valid JSONL with all required fields ("query", "positive",
    /// "negatives", metadata), normalized queries, language "rust", and a
    /// checkpoint file next to the output.
    #[test]
    fn integration_generate_training_data() {
        let repo_dir = create_test_repo();
        let out_dir = tempfile::TempDir::new().unwrap();
        let output_path = out_dir.path().join("train.jsonl");

        let config = test_config(repo_dir.path(), &output_path, 5, false, true);

        let stats = generate_training_data(&config).unwrap();

        // Every commit message in the fixture is at least 30 bytes, so none is
        // dropped by min_msg_len=10 and the changed functions should yield
        // triplets.
        assert!(
            stats.total_triplets > 0,
            "Expected some triplets, got {}",
            stats.total_triplets
        );
        assert_eq!(stats.repos_processed, 1);
        assert!(stats.commits_processed > 0);

        // Verify JSONL output is valid
        let content = std::fs::read_to_string(&output_path).unwrap();
        assert!(!content.is_empty(), "Output file should not be empty");

        for line in content.lines() {
            let triplet: serde_json::Value = serde_json::from_str(line)
                .unwrap_or_else(|e| panic!("Invalid JSON line: {}\n{}", e, line));

            // Check required fields
            assert!(triplet.get("query").is_some(), "Missing query field");
            assert!(triplet.get("positive").is_some(), "Missing positive field");
            assert!(
                triplet.get("negatives").is_some(),
                "Missing negatives field"
            );
            assert!(triplet.get("repo").is_some(), "Missing repo field");
            assert!(triplet.get("commit").is_some(), "Missing commit field");
            assert!(triplet.get("file").is_some(), "Missing file field");
            assert!(
                triplet.get("function_name").is_some(),
                "Missing function_name"
            );
            assert!(triplet.get("language").is_some(), "Missing language");
            assert!(triplet.get("commit_date").is_some(), "Missing commit_date");
            assert!(triplet.get("diff_lines").is_some(), "Missing diff_lines");
            assert!(
                triplet.get("function_size").is_some(),
                "Missing function_size"
            );

            // Query should be normalized (no conventional prefix)
            let q = triplet["query"].as_str().unwrap();
            assert!(
                !q.starts_with("add ") && !q.starts_with("fix "),
                "Query not normalized: {}",
                q
            );

            // Language should be rust
            assert_eq!(triplet["language"].as_str().unwrap(), "rust");
        }

        // Checkpoint file should exist
        let checkpoint_path = output_path.with_extension("jsonl.checkpoint");
        assert!(
            checkpoint_path.exists(),
            "Checkpoint file should exist at {}",
            checkpoint_path.display()
        );
    }

    /// Runs generation twice on the same output — a fresh run, then a resumed
    /// run — and checks the resumed run emits no triplets and leaves the
    /// output line count unchanged.
    #[test]
    fn integration_resume_produces_no_duplicates() {
        let repo_dir = create_test_repo();
        let out_dir = tempfile::TempDir::new().unwrap();
        let output_path = out_dir.path().join("train.jsonl");

        // First run
        let config = test_config(repo_dir.path(), &output_path, 5, false, false);
        let stats1 = generate_training_data(&config).unwrap();
        let first_count = std::fs::read_to_string(&output_path)
            .unwrap()
            .lines()
            .count();

        // Second run with resume
        let config_resume = test_config(repo_dir.path(), &output_path, 5, true, false);
        let stats2 = generate_training_data(&config_resume).unwrap();
        let second_count = std::fs::read_to_string(&output_path)
            .unwrap()
            .lines()
            .count();

        // Resume should produce no new triplets
        assert_eq!(
            first_count, second_count,
            "Resume should not produce duplicates (first: {}, second: {})",
            first_count, second_count
        );
        assert_eq!(
            stats2.total_triplets, 0,
            "Resume run should emit 0 new triplets"
        );
        assert!(
            stats1.total_triplets > 0,
            "First run should have produced triplets"
        );
    }

    /// A directory that is not a git repository is skipped: the run succeeds
    /// with zero repos processed and zero triplets.
    #[test]
    fn skips_non_git_repos() {
        let dir = tempfile::TempDir::new().unwrap();
        let out_dir = tempfile::TempDir::new().unwrap();
        let output_path = out_dir.path().join("train.jsonl");

        let config = test_config(dir.path(), &output_path, 5, false, false);

        let stats = generate_training_data(&config).unwrap();
        assert_eq!(stats.repos_processed, 0);
        assert_eq!(stats.total_triplets, 0);
    }

    /// A run with `dedup_cap = 1` must never emit more triplets than the same
    /// run with `dedup_cap = 100`.
    #[test]
    fn dedup_cap_limits_triplets() {
        let repo_dir = create_test_repo();
        let out_dir = tempfile::TempDir::new().unwrap();

        // dedup_cap=1 means each unique function content only produces 1 triplet
        let output_path = out_dir.path().join("train.jsonl");
        let config = test_config(repo_dir.path(), &output_path, 1, false, false);
        let stats_capped = generate_training_data(&config).unwrap();

        // Run again with high cap for comparison
        let output_path2 = out_dir.path().join("train2.jsonl");
        let config2 = test_config(repo_dir.path(), &output_path2, 100, false, false);
        let stats_uncapped = generate_training_data(&config2).unwrap();

        assert!(
            stats_capped.total_triplets <= stats_uncapped.total_triplets,
            "Capped ({}) should be <= uncapped ({})",
            stats_capped.total_triplets,
            stats_uncapped.total_triplets
        );
    }
}