Skip to main content

aicx_parser/
segmentation.rs

1//! Semantic segmentation for canonical store ownership.
2//!
3//! Reconstructs repository-scoped session segments from content signals rather
4//! than weak source-side identifiers.
5//!
6//! Vibecrafted with AI Agents by VetCoders (c)2026 VetCoders
7
8use crate::timeline::{Kind, RepoIdentity, SemanticSegment, SourceTier, TimelineEntry};
9use regex::Regex;
10use serde::{Deserialize, Serialize};
11use std::collections::HashMap;
12use std::path::{Path, PathBuf};
13use std::process::Command;
14
15// ============================================================================
16// Source trust model
17// ============================================================================
18
/// A repo identity paired with the trust tier of the signal that produced it.
///
/// Values of this type are what the inference helpers in this module return:
/// the `identity` says *which* repository was inferred, the `tier` says how
/// trustworthy the signal behind that inference was.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct TieredIdentity {
    /// The inferred organization/repository pair.
    pub identity: RepoIdentity,
    /// Trust tier of the signal that yielded `identity`.
    pub tier: SourceTier,
}
25
/// Explicit source used when assigning an entry/session to a bucket.
///
/// `resolve_bucket` reports one of these alongside the bucket name so that a
/// reader can tell which signal decided the assignment.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum BucketingSource {
    // NOTE(review): not produced by any code visible in this file section;
    // presumably set where an operator explicitly overrides the bucket — confirm.
    OperatorOverride,
    // Reported by `resolve_bucket` when the cwd signal resolved at `SourceTier::Primary`.
    CwdGitRemote,
    // Reported by `resolve_bucket` when the cwd signal resolved at `SourceTier::Secondary`.
    CwdGitRoot,
    // Reported by `resolve_bucket` when the cwd signal resolved at `SourceTier::Fallback`,
    // and also when a hash-like cwd was resolved through the projectHash registry.
    KnownLayout,
    // NOTE(review): not produced by any code visible in this file section;
    // presumably refers to document frontmatter metadata — confirm.
    Frontmatter,
    // Reported by `resolve_bucket` when the identity was inferred from message text.
    ContentMention,
    // Reported by `resolve_bucket` when no signal resolved (bucket "unclassified"),
    // and used as the mapping for `SourceTier::Opaque`.
    Unclassified,
}
37
/// Outcome of `resolve_bucket`: the chosen bucket plus the evidence behind it.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct BucketResolution {
    /// Bucket name: the identity's `slug()` when one was resolved, otherwise
    /// the literal `"unclassified"`.
    pub bucket: String,
    /// Which signal decided the bucket.
    pub source: BucketingSource,
    /// The resolved repo identity, or `None` for the unclassified bucket.
    pub identity: Option<RepoIdentity>,
}
44
45// ============================================================================
46// Gemini projectHash registry
47// ============================================================================
48
/// Registry mapping Gemini `projectHash` values to known repo roots.
///
/// The mapping lives in `~/.aicx/gemini-project-map.json` and must be
/// maintained by the user or by `aicx init`. A projectHash that is not
/// in this file cannot resolve to a repo — it stays Opaque.
///
/// The `Default` value is an empty registry, which resolves nothing.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct ProjectHashRegistry {
    /// Maps `projectHash` (hex string) → absolute path to project root.
    ///
    /// `#[serde(default)]` lets a JSON document without a `mappings` key
    /// deserialize into an empty map instead of failing.
    #[serde(default)]
    pub mappings: HashMap<String, String>,
}
60
61impl ProjectHashRegistry {
62    /// Load from the default location (`~/.aicx/gemini-project-map.json`).
63    /// Returns an empty registry if the file doesn't exist or can't be parsed.
64    pub fn load_default() -> Self {
65        let Some(home) = std::env::var_os("HOME").map(PathBuf::from) else {
66            return Self::default();
67        };
68        let path = home.join(".aicx").join("gemini-project-map.json");
69        Self::load_from(&path)
70    }
71
72    /// Load from a specific path.
73    pub fn load_from(path: &Path) -> Self {
74        std::fs::read_to_string(path)
75            .ok()
76            .and_then(|content| serde_json::from_str(&content).ok())
77            .unwrap_or_default()
78    }
79
80    /// Resolve a projectHash to a `TieredIdentity` by looking up the mapped
81    /// path and then inferring repo identity from that path.
82    pub fn resolve(&self, project_hash: &str) -> Option<TieredIdentity> {
83        let root_path = self.mappings.get(project_hash)?;
84        let path = PathBuf::from(root_path);
85        let identity = infer_repo_identity_from_path(&path)?;
86        Some(TieredIdentity {
87            identity,
88            tier: SourceTier::Secondary,
89        })
90    }
91}
92
93pub fn semantic_segments(entries: &[TimelineEntry]) -> Vec<SemanticSegment> {
94    semantic_segments_with_registry(entries, &ProjectHashRegistry::default())
95}
96
97pub fn semantic_segments_with_registry(
98    entries: &[TimelineEntry],
99    registry: &ProjectHashRegistry,
100) -> Vec<SemanticSegment> {
101    let mut sessions: HashMap<(String, String), Vec<TimelineEntry>> = HashMap::new();
102    for entry in entries {
103        sessions
104            .entry((entry.agent.clone(), entry.session_id.clone()))
105            .or_default()
106            .push(entry.clone());
107    }
108
109    let mut ordered = Vec::new();
110
111    for ((agent, session_id), mut session_entries) in sessions {
112        session_entries.sort_by_key(|left| left.timestamp);
113
114        let mut current_tiered: Option<TieredIdentity> = None;
115        let mut current_entries: Vec<TimelineEntry> = Vec::new();
116
117        for entry in session_entries {
118            let explicit = infer_tiered_identity_from_entry(&entry, registry);
119
120            let explicit_repo = explicit.as_ref().map(|t| &t.identity);
121            let current_repo = current_tiered.as_ref().map(|t| &t.identity);
122
123            let split_for_first_truth =
124                !current_entries.is_empty() && current_repo.is_none() && explicit_repo.is_some();
125            let split_for_context_switch = !current_entries.is_empty()
126                && explicit_repo
127                    .zip(current_repo)
128                    .is_some_and(|(next_repo, active_repo)| next_repo != active_repo);
129
130            if split_for_first_truth || split_for_context_switch {
131                let tier = current_tiered.as_ref().map(|t| t.tier);
132                ordered.push(build_segment(
133                    current_tiered.take().map(|t| t.identity),
134                    tier,
135                    &agent,
136                    &session_id,
137                    std::mem::take(&mut current_entries),
138                ));
139            }
140
141            if current_entries.is_empty() {
142                current_tiered = explicit.clone();
143            }
144
145            if current_tiered.is_none() && explicit.is_some() {
146                current_tiered = explicit.clone();
147            }
148
149            current_entries.push(entry);
150        }
151
152        if !current_entries.is_empty() {
153            let tier = current_tiered.as_ref().map(|t| t.tier);
154            ordered.push(build_segment(
155                current_tiered.map(|t| t.identity),
156                tier,
157                &agent,
158                &session_id,
159                current_entries,
160            ));
161        }
162    }
163
164    ordered.sort_by(|left, right| {
165        left.entries
166            .first()
167            .map(|entry| entry.timestamp)
168            .cmp(&right.entries.first().map(|entry| entry.timestamp))
169            .then_with(|| left.agent.cmp(&right.agent))
170            .then_with(|| left.session_id.cmp(&right.session_id))
171    });
172
173    ordered
174}
175
176pub fn infer_repo_identity_from_entry(entry: &TimelineEntry) -> Option<RepoIdentity> {
177    infer_tiered_identity_from_entry(entry, &ProjectHashRegistry::default()).map(|t| t.identity)
178}
179
180pub fn resolve_bucket(entry: &TimelineEntry, registry: &ProjectHashRegistry) -> BucketResolution {
181    if let Some(tiered) = infer_tiered_identity_from_cwd(entry.cwd.as_deref()) {
182        let source = match tiered.tier {
183            SourceTier::Primary => BucketingSource::CwdGitRemote,
184            SourceTier::Secondary => BucketingSource::CwdGitRoot,
185            SourceTier::Fallback => BucketingSource::KnownLayout,
186            SourceTier::Opaque => BucketingSource::Unclassified,
187        };
188        return BucketResolution {
189            bucket: tiered.identity.slug(),
190            source,
191            identity: Some(tiered.identity),
192        };
193    }
194
195    if let Some(cwd) = entry.cwd.as_deref()
196        && looks_like_weak_source_identifier(cwd)
197        && let Some(tiered) = registry.resolve(cwd)
198    {
199        return BucketResolution {
200            bucket: tiered.identity.slug(),
201            source: BucketingSource::KnownLayout,
202            identity: Some(tiered.identity),
203        };
204    }
205
206    if let Some(tiered) = infer_tiered_identity_from_text(&entry.message) {
207        return BucketResolution {
208            bucket: tiered.identity.slug(),
209            source: BucketingSource::ContentMention,
210            identity: Some(tiered.identity),
211        };
212    }
213
214    BucketResolution {
215        bucket: "unclassified".to_string(),
216        source: BucketingSource::Unclassified,
217        identity: None,
218    }
219}
220
221/// Infer repo identity with explicit trust tier from all available signals.
222///
223/// Signal precedence (highest to lowest):
224/// 1. CWD that resolves via local git + remote -> Primary
225/// 2. CWD that resolves via local git + known layout/basename -> Secondary
226/// 3. CWD via known layout (no .git) -> Fallback
227/// 4. ProjectHash resolved through registry -> Secondary
228/// 5. Content mentions -> Fallback tags-only signal
229/// 6. Pure hex hash CWD / opaque -> Opaque (returns None)
230pub fn infer_tiered_identity_from_entry(
231    entry: &TimelineEntry,
232    registry: &ProjectHashRegistry,
233) -> Option<TieredIdentity> {
234    if let Some(tiered) = infer_tiered_identity_from_cwd(entry.cwd.as_deref()) {
235        return Some(tiered);
236    }
237
238    // Last resort: try projectHash registry for Gemini sessions.
239    // The cwd field for Gemini sessions is often the projectHash itself.
240    if let Some(cwd) = entry.cwd.as_deref()
241        && looks_like_weak_source_identifier(cwd)
242    {
243        return registry.resolve(cwd);
244    }
245
246    infer_tiered_identity_from_text(&entry.message)
247}
248
249/// Classify a raw CWD string into a source tier without resolving identity.
250pub fn classify_cwd_tier(cwd: Option<&str>) -> SourceTier {
251    let Some(raw) = cwd else {
252        return SourceTier::Opaque;
253    };
254    let trimmed = raw.trim();
255    if trimmed.is_empty() {
256        return SourceTier::Opaque;
257    }
258    if looks_like_weak_source_identifier(trimmed) {
259        return SourceTier::Opaque;
260    }
261    let path = expand_home(trimmed);
262    if discover_git_root(&path).is_some() {
263        return SourceTier::Secondary;
264    }
265    if infer_repo_identity_from_known_layout(&path).is_some() {
266        return SourceTier::Fallback;
267    }
268    SourceTier::Opaque
269}
270
271fn build_segment(
272    repo: Option<RepoIdentity>,
273    source_tier: Option<SourceTier>,
274    agent: &str,
275    session_id: &str,
276    entries: Vec<TimelineEntry>,
277) -> SemanticSegment {
278    let kind = classify_segment_kind(&entries);
279    SemanticSegment {
280        repo,
281        source_tier,
282        kind,
283        agent: agent.to_string(),
284        session_id: session_id.to_string(),
285        entries,
286    }
287}
288
289fn classify_segment_kind(entries: &[TimelineEntry]) -> Kind {
290    if entries.is_empty() {
291        return Kind::Other;
292    }
293
294    let has_conversation = entries
295        .iter()
296        .any(|entry| entry.role == "user" || entry.role == "assistant");
297
298    let report_score = entries
299        .iter()
300        .map(|entry| classify_report_signal(entry.message.as_str()))
301        .sum::<u8>();
302    let plan_score = entries
303        .iter()
304        .map(|entry| classify_plan_signal(entry.message.as_str()))
305        .sum::<u8>();
306
307    if report_score >= 2 && report_score > plan_score && !has_conversation {
308        Kind::Reports
309    } else if plan_score >= 2 && plan_score >= report_score {
310        Kind::Plans
311    } else if has_conversation {
312        Kind::Conversations
313    } else if report_score > 0 {
314        Kind::Reports
315    } else {
316        Kind::Other
317    }
318}
319
/// Counts how many distinct plan markers occur in `message`.
///
/// Matching is ASCII case-insensitive and each marker counts at most once,
/// so the result is in `0..=6`.
fn classify_plan_signal(message: &str) -> u8 {
    const PLAN_MARKERS: [&str; 6] = [
        "goal:",
        "acceptance:",
        "test gate:",
        "- [ ]",
        "plan:",
        "migration plan",
    ];

    let haystack = message.to_ascii_lowercase();
    PLAN_MARKERS
        .iter()
        .map(|marker| u8::from(haystack.contains(marker)))
        .sum()
}
329
/// Counts how many distinct report markers occur in `message`.
///
/// Matching is ASCII case-insensitive and each marker counts at most once,
/// so the result is in `0..=5`.
fn classify_report_signal(message: &str) -> u8 {
    const REPORT_MARKERS: [&str; 5] = [
        "recovery report",
        "audit report",
        "coverage report",
        "status report",
        "summary",
    ];

    let haystack = message.to_ascii_lowercase();
    REPORT_MARKERS
        .iter()
        .map(|marker| u8::from(haystack.contains(marker)))
        .sum()
}
338
339fn infer_repo_identity_from_path(path: &Path) -> Option<RepoIdentity> {
340    if let Some(repo) = infer_repo_identity_from_local_git(path) {
341        return Some(repo);
342    }
343
344    infer_repo_identity_from_known_layout(path)
345}
346
347// ── Tiered inference helpers ──────────────────────────────────────────────
348
349fn infer_tiered_identity_from_text(text: &str) -> Option<TieredIdentity> {
350    // Content mentions remain queryable hints, but they must not assert
351    // canonical store ownership over cwd/git truth.
352    if let Some(identity) = infer_repo_identity_from_remote_like(text) {
353        return Some(TieredIdentity {
354            identity,
355            tier: SourceTier::Fallback,
356        });
357    }
358
359    // Path in text → tier depends on how it resolves
360    let path_re = Regex::new(r"(/[A-Za-z0-9._~\-]+(?:/[A-Za-z0-9._~\-]+)+)").ok()?;
361    for capture in path_re.captures_iter(text) {
362        let raw = capture.get(1)?.as_str();
363        let path = PathBuf::from(raw);
364        if let Some(tiered) = infer_tiered_identity_from_path(&path) {
365            return Some(tiered);
366        }
367    }
368
369    None
370}
371
372fn infer_tiered_identity_from_cwd(cwd: Option<&str>) -> Option<TieredIdentity> {
373    let cwd = cwd?.trim();
374    if cwd.is_empty() || looks_like_weak_source_identifier(cwd) {
375        return None;
376    }
377
378    // Remote-like CWD → Primary
379    if let Some(identity) = infer_repo_identity_from_remote_like(cwd) {
380        return Some(TieredIdentity {
381            identity,
382            tier: SourceTier::Primary,
383        });
384    }
385
386    let path = expand_home(cwd);
387    infer_tiered_identity_from_path(&path)
388}
389
390fn infer_tiered_identity_from_path(path: &Path) -> Option<TieredIdentity> {
391    // Local git with remote → Primary
392    if let Some(repo_root) = discover_git_root(path) {
393        if let Some(identity) = infer_repo_identity_from_git_remote(&repo_root) {
394            return Some(TieredIdentity {
395                identity,
396                tier: SourceTier::Primary,
397            });
398        }
399        // Local git with known layout → Secondary
400        if let Some(identity) = infer_repo_identity_from_known_layout(&repo_root) {
401            return Some(TieredIdentity {
402                identity,
403                tier: SourceTier::Secondary,
404            });
405        }
406        // Local git, basename only → Secondary
407        if let Some(name) = repo_root.file_name() {
408            return Some(TieredIdentity {
409                identity: RepoIdentity {
410                    organization: "local".to_string(),
411                    repository: name.to_string_lossy().to_string(),
412                },
413                tier: SourceTier::Secondary,
414            });
415        }
416    }
417
418    // Known layout without .git → Fallback
419    if let Some(identity) = infer_repo_identity_from_known_layout(path) {
420        return Some(TieredIdentity {
421            identity,
422            tier: SourceTier::Fallback,
423        });
424    }
425
426    None
427}
428
429fn infer_repo_identity_from_local_git(path: &Path) -> Option<RepoIdentity> {
430    let repo_root = discover_git_root(path)?;
431    infer_repo_identity_from_git_remote(&repo_root)
432        .or_else(|| infer_repo_identity_from_known_layout(&repo_root))
433        .or_else(|| {
434            repo_root.file_name().map(|name| RepoIdentity {
435                organization: "local".to_string(),
436                repository: name.to_string_lossy().to_string(),
437            })
438        })
439}
440
/// Finds the nearest ancestor directory (including the start itself) that
/// contains a `.git` entry.
///
/// When `path` is an existing regular file the search starts at its parent
/// directory; otherwise it starts at `path` as given.
fn discover_git_root(path: &Path) -> Option<PathBuf> {
    let start: &Path = if path.is_file() {
        path.parent()?
    } else {
        path
    };

    let mut cursor = Some(start);
    while let Some(dir) = cursor {
        if dir.join(".git").exists() {
            return Some(dir.to_path_buf());
        }
        cursor = dir.parent();
    }
    None
}
452
453fn infer_repo_identity_from_git_remote(repo_root: &Path) -> Option<RepoIdentity> {
454    let output = Command::new("git")
455        .arg("-C")
456        .arg(repo_root)
457        .args(["remote", "get-url", "origin"])
458        .output()
459        .ok()?;
460
461    if !output.status.success() {
462        return None;
463    }
464
465    let remote = String::from_utf8_lossy(&output.stdout);
466    infer_repo_identity_from_remote_like(remote.trim())
467}
468
469fn infer_repo_identity_from_known_layout(path: &Path) -> Option<RepoIdentity> {
470    let components: Vec<String> = path
471        .components()
472        .map(|component| component.as_os_str().to_string_lossy().to_string())
473        .collect();
474
475    for marker in ["hosted", "repos", "repositories", "github", "git"] {
476        let marker_index = components
477            .iter()
478            .position(|component| component == marker)?;
479        if components.len() > marker_index + 2 {
480            let organization = components[marker_index + 1].clone();
481            let repository = components[marker_index + 2].clone();
482            if is_probably_repo_name(&organization) && is_probably_repo_name(&repository) {
483                return Some(RepoIdentity {
484                    organization,
485                    repository,
486                });
487            }
488        }
489    }
490
491    None
492}
493
494fn infer_repo_identity_from_remote_like(raw: &str) -> Option<RepoIdentity> {
495    for token in raw.split_whitespace() {
496        let trimmed = token
497            .trim_matches(|ch: char| matches!(ch, '"' | '\'' | ',' | '.' | ')' | '(' | '[' | ']'));
498        for prefix in [
499            "https://github.com/",
500            "http://github.com/",
501            "https://gitlab.com/",
502            "http://gitlab.com/",
503            "git@github.com:",
504            "git@gitlab.com:",
505        ] {
506            if let Some(rest) = trimmed.strip_prefix(prefix)
507                && let Some(repo) = repo_identity_from_remote_path(rest)
508            {
509                return Some(repo);
510            }
511        }
512    }
513
514    None
515}
516
517fn repo_identity_from_remote_path(path: &str) -> Option<RepoIdentity> {
518    let mut parts = path.split('/');
519    let organization = parts.next()?.trim();
520    let repository = parts.next()?.trim().trim_end_matches(".git");
521
522    if is_probably_repo_name(organization) && is_probably_repo_name(repository) {
523        return Some(RepoIdentity {
524            organization: organization.to_string(),
525            repository: repository.to_string(),
526        });
527    }
528
529    Some(RepoIdentity {
530        organization: "local".to_string(),
531        repository: local_repo_fallback(repository),
532    })
533}
534
535fn local_repo_fallback(repository: &str) -> String {
536    if is_probably_repo_name(repository) {
537        repository.to_string()
538    } else {
539        "unknown".to_string()
540    }
541}
542
/// Returns `true` when `raw`, after trimming whitespace, is a bare run of at
/// least 16 ASCII hex digits — i.e. an opaque hash rather than a path or URL.
fn looks_like_weak_source_identifier(raw: &str) -> bool {
    let candidate = raw.trim();
    // An all-hex string cannot contain `/` or `:`, so a single byte-level
    // check covers the "not a path, not a URL" requirement as well.
    candidate.len() >= 16 && candidate.bytes().all(|byte| byte.is_ascii_hexdigit())
}
550
/// Expands a leading `~` into the directory named by `$HOME`.
///
/// Both the bare `~` and the `~/rest` forms are expanded. Anything else —
/// including `~user/...` — is returned unchanged, as is every input when
/// `HOME` is not set.
fn expand_home(raw: &str) -> PathBuf {
    let home_dir = || std::env::var_os("HOME").map(PathBuf::from);

    if raw == "~" {
        // A session started directly in the home directory.
        if let Some(home) = home_dir() {
            return home;
        }
    } else if let Some(rest) = raw.strip_prefix("~/") {
        if let Some(home) = home_dir() {
            return home.join(rest);
        }
    }

    PathBuf::from(raw)
}
560
/// Heuristic check that `value` could be a real organization or repository
/// name rather than a placeholder, template literal, or generic directory.
///
/// A name is accepted when it:
/// * is 1 to 64 bytes long;
/// * starts with an ASCII letter or digit;
/// * otherwise contains only ASCII letters, digits, `.`, `-` or `_`;
/// * is not one of a few reserved/generic words (compared case-insensitively);
/// * has dots making up no more than half of its characters.
fn is_probably_repo_name(value: &str) -> bool {
    const RESERVED: [&str; 12] = [
        ".",
        "..",
        "...",
        "local",
        "tmp",
        "temp",
        "src",
        "app",
        "lib",
        "docs",
        "workspace",
        "workspaces",
    ];

    let length_ok = (1..=64).contains(&value.len());
    let charset_ok = value.char_indices().all(|(index, ch)| {
        ch.is_ascii_alphanumeric() || (index > 0 && matches!(ch, '.' | '-' | '_'))
    });
    if !(length_ok && charset_ok) {
        return false;
    }

    if RESERVED
        .iter()
        .any(|word| value.eq_ignore_ascii_case(word))
    {
        return false;
    }

    // Only ASCII survives the charset check, so byte length equals the
    // character count here.
    let dots = value.bytes().filter(|byte| *byte == b'.').count();
    dots <= value.len() / 2
}
602
603#[cfg(test)]
604mod tests {
605    use super::*;
606    use chrono::{TimeZone, Utc};
607    use std::fs;
608
609    fn entry(
610        ts: (i32, u32, u32, u32, u32, u32),
611        session_id: &str,
612        role: &str,
613        message: &str,
614        cwd: Option<&str>,
615    ) -> TimelineEntry {
616        TimelineEntry {
617            timestamp: Utc
618                .with_ymd_and_hms(ts.0, ts.1, ts.2, ts.3, ts.4, ts.5)
619                .unwrap(),
620            agent: "claude".to_string(),
621            session_id: session_id.to_string(),
622            role: role.to_string(),
623            message: message.to_string(),
624            branch: None,
625            cwd: cwd.map(ToOwned::to_owned),
626            frame_kind: None,
627        }
628    }
629
630    fn mk_tmp_dir(name: &str) -> PathBuf {
631        std::env::temp_dir().join(format!(
632            "ai-contexters-segmentation-{name}-{}-{}",
633            std::process::id(),
634            Utc::now().timestamp_nanos_opt().unwrap_or_default()
635        ))
636    }
637
638    #[test]
639    fn repo_signal_segmentation_splits_one_session_across_multiple_repositories() {
640        let entries = vec![
641            entry(
642                (2026, 3, 21, 9, 0, 0),
643                "sess-1",
644                "user",
645                "Please inspect https://github.com/VetCoders/ai-contexters before editing.",
646                None,
647            ),
648            entry(
649                (2026, 3, 21, 9, 1, 0),
650                "sess-1",
651                "assistant",
652                "I found the store seam in ai-contexters.",
653                None,
654            ),
655            entry(
656                (2026, 3, 21, 9, 2, 0),
657                "sess-1",
658                "user",
659                "Switch now to https://github.com/VetCoders/loctree and review the scanner.",
660                None,
661            ),
662            entry(
663                (2026, 3, 21, 9, 3, 0),
664                "sess-1",
665                "assistant",
666                "I am reviewing loctree next.",
667                None,
668            ),
669        ];
670
671        let segments = semantic_segments(&entries);
672        assert_eq!(segments.len(), 2);
673        assert_eq!(segments[0].project_label(), "VetCoders/ai-contexters");
674        assert_eq!(segments[1].project_label(), "VetCoders/loctree");
675    }
676
677    #[test]
678    fn repo_signal_segmentation_keeps_unknown_prefix_honest() {
679        let entries = vec![
680            entry(
681                (2026, 3, 21, 9, 0, 0),
682                "sess-2",
683                "user",
684                "Need a migration plan but I have not named the repo yet.",
685                None,
686            ),
687            entry(
688                (2026, 3, 21, 9, 1, 0),
689                "sess-2",
690                "assistant",
691                "Drafting a migration plan with acceptance criteria.",
692                None,
693            ),
694            entry(
695                (2026, 3, 21, 9, 2, 0),
696                "sess-2",
697                "user",
698                "The actual repo is https://github.com/VetCoders/ai-contexters.",
699                None,
700            ),
701        ];
702
703        let segments = semantic_segments(&entries);
704        assert_eq!(segments.len(), 2);
705        assert!(segments[0].repo.is_none());
706        assert_eq!(segments[0].kind, Kind::Plans);
707        assert_eq!(segments[1].project_label(), "VetCoders/ai-contexters");
708    }
709
710    #[test]
711    fn repo_signal_segmentation_ignores_gemini_hash_like_cwd() {
712        let entry = entry(
713            (2026, 3, 21, 9, 0, 0),
714            "sess-3",
715            "user",
716            "No trustworthy repo here.",
717            Some("57cfd37b3a72d995c4f2d018ebf9d5a2"),
718        );
719
720        assert!(infer_repo_identity_from_entry(&entry).is_none());
721        let segments = semantic_segments(&[entry]);
722        assert_eq!(segments.len(), 1);
723        assert!(segments[0].repo.is_none());
724    }
725
726    #[test]
727    fn repo_signal_segmentation_uses_local_git_remote_when_available() {
728        let root = mk_tmp_dir("git-remote");
729        let repo = root.join("hosted").join("VetCoders").join("ai-contexters");
730        fs::create_dir_all(&repo).unwrap();
731
732        Command::new("git")
733            .arg("init")
734            .arg(&repo)
735            .output()
736            .expect("git init should run");
737        Command::new("git")
738            .arg("-C")
739            .arg(&repo)
740            .args([
741                "remote",
742                "add",
743                "origin",
744                "git@github.com:VetCoders/ai-contexters.git",
745            ])
746            .output()
747            .expect("git remote add should run");
748
749        let entry = entry(
750            (2026, 3, 21, 9, 0, 0),
751            "sess-4",
752            "user",
753            "Inspect the repo on disk.",
754            Some(repo.to_string_lossy().as_ref()),
755        );
756
757        let repo_identity = infer_repo_identity_from_entry(&entry).expect("repo identity");
758        assert_eq!(repo_identity.slug(), "VetCoders/ai-contexters");
759
760        let _ = fs::remove_dir_all(&root);
761    }
762
763    // ================================================================
764    // Source tier tests
765    // ================================================================
766
    // NOTE(review): despite the `_is_primary` suffix, this test pins the
    // opposite: a GitHub URL that appears only in message text must resolve
    // at `SourceTier::Fallback` and must not be assertable. Consider renaming
    // the test so the name matches the assertions.
    #[test]
    fn source_tier_github_url_is_primary() {
        let e = entry(
            (2026, 3, 22, 10, 0, 0),
            "sess-tier",
            "user",
            "Check https://github.com/VetCoders/ai-contexters for updates.",
            None,
        );
        let tiered = infer_tiered_identity_from_entry(&e, &ProjectHashRegistry::default())
            .expect("should resolve");
        assert_eq!(tiered.tier, SourceTier::Fallback);
        assert_eq!(tiered.identity.slug(), "VetCoders/ai-contexters");
        assert!(!tiered.tier.is_assertable());
    }
782
783    #[test]
784    fn cwd_git_identity_wins_over_content_mentions() {
785        let root = mk_tmp_dir("cwd-wins");
786        let repo = root.join("Git").join("vista");
787        fs::create_dir_all(&repo).unwrap();
788
789        Command::new("git")
790            .arg("init")
791            .arg(&repo)
792            .output()
793            .expect("git init");
794
795        let e = entry(
796            (2026, 5, 6, 10, 0, 0),
797            "sess-cwd-wins",
798            "user",
799            "We need to inspect https://github.com/RustCrypto/RSA while working locally.",
800            Some(repo.to_string_lossy().as_ref()),
801        );
802
803        let resolution = resolve_bucket(&e, &ProjectHashRegistry::default());
804        assert_eq!(resolution.bucket, "local/vista");
805        assert_eq!(resolution.source, BucketingSource::CwdGitRoot);
806
807        let _ = fs::remove_dir_all(&root);
808    }
809
810    #[test]
811    fn rejects_template_literals() {
812        assert!(!is_probably_repo_name("{target_owner}"));
813        assert!(!is_probably_repo_name("<YOUR_USERNAME>"));
814        assert!(!is_probably_repo_name("${RELEASE_REPO}"));
815        assert!(!is_probably_repo_name("$REPO"));
816        assert!(!is_probably_repo_name("{org}"));
817    }
818
819    #[test]
820    fn rejects_dot_only_and_traversal_strings() {
821        assert!(!is_probably_repo_name("..."));
822        assert!(!is_probably_repo_name(".."));
823        assert!(!is_probably_repo_name("."));
824        assert!(!is_probably_repo_name(".../"));
825        assert!(!is_probably_repo_name("..hidden"));
826    }
827
828    #[test]
829    fn rejects_control_chars_and_separators() {
830        assert!(!is_probably_repo_name("foo/bar"));
831        assert!(!is_probably_repo_name("foo\\bar"));
832        assert!(!is_probably_repo_name("foo\nbar"));
833        assert!(!is_probably_repo_name("foo bar"));
834        assert!(!is_probably_repo_name(""));
835    }
836
837    #[test]
838    fn accepts_real_repo_names() {
839        assert!(is_probably_repo_name("vibecrafted"));
840        assert!(is_probably_repo_name("rust-memex"));
841        assert!(is_probably_repo_name("ai-contexters"));
842        assert!(is_probably_repo_name("vc-runtime"));
843        assert!(is_probably_repo_name("CodeScribe"));
844        assert!(is_probably_repo_name("starship"));
845        assert!(is_probably_repo_name("01mf02"));
846        assert!(is_probably_repo_name("a"));
847    }
848
849    #[test]
850    fn fallback_routes_invalid_remote_owner_to_local_bucket() {
851        let e = entry(
852            (2026, 3, 22, 10, 0, 0),
853            "sess-local-fallback",
854            "user",
855            "Clone https://github.com/{target_owner}/vibecrafted.git before release.",
856            None,
857        );
858
859        let tiered = infer_tiered_identity_from_entry(&e, &ProjectHashRegistry::default())
860            .expect("malformed remote should resolve to local fallback");
861        assert_eq!(tiered.identity.slug(), "local/vibecrafted");
862        assert!(!tiered.tier.is_assertable());
863
864        let segments = semantic_segments(&[e]);
865        assert_eq!(segments.len(), 1);
866        assert_eq!(segments[0].project_label(), "local/vibecrafted");
867        assert_ne!(segments[0].project_label(), "{target_owner}/vibecrafted");
868    }
869
870    #[test]
871    fn fallback_routes_invalid_remote_repo_to_unknown_local_bucket() {
872        let identity = infer_repo_identity_from_remote_like(
873            "https://github.com/VetCoders/${RELEASE_REPO}.git",
874        )
875        .expect("malformed repository should resolve to local unknown fallback");
876
877        assert_eq!(identity.slug(), "local/unknown");
878    }
879
880    #[test]
881    fn source_tier_git_remote_cwd_is_primary() {
882        let root = mk_tmp_dir("tier-git-remote");
883        let repo = root.join("hosted").join("VetCoders").join("loctree");
884        fs::create_dir_all(&repo).unwrap();
885
886        Command::new("git")
887            .arg("init")
888            .arg(&repo)
889            .output()
890            .expect("git init");
891        Command::new("git")
892            .arg("-C")
893            .arg(&repo)
894            .args([
895                "remote",
896                "add",
897                "origin",
898                "git@github.com:VetCoders/loctree.git",
899            ])
900            .output()
901            .expect("git remote add");
902
903        let e = entry(
904            (2026, 3, 22, 10, 0, 0),
905            "sess-tier-git",
906            "user",
907            "Working in the repo.",
908            Some(repo.to_string_lossy().as_ref()),
909        );
910
911        let tiered = infer_tiered_identity_from_entry(&e, &ProjectHashRegistry::default())
912            .expect("should resolve");
913        assert_eq!(tiered.tier, SourceTier::Primary);
914        assert_eq!(tiered.identity.slug(), "VetCoders/loctree");
915
916        let _ = fs::remove_dir_all(&root);
917    }
918
919    #[test]
920    fn source_tier_known_layout_without_git_is_fallback() {
921        let e = entry(
922            (2026, 3, 22, 10, 0, 0),
923            "sess-tier-layout",
924            "user",
925            "Working at /nonexistent/hosted/SomeOrg/SomeRepo",
926            None,
927        );
928        let tiered = infer_tiered_identity_from_entry(&e, &ProjectHashRegistry::default());
929        // Path in message text resolved via known layout (no .git) → Fallback
930        if let Some(t) = tiered {
931            assert_eq!(t.tier, SourceTier::Fallback);
932            assert!(!t.tier.is_assertable());
933        }
934        // It's also OK if it returns None (path doesn't exist on disk)
935    }
936
937    #[test]
938    fn source_tier_hex_hash_cwd_is_opaque_without_registry() {
939        let e = entry(
940            (2026, 3, 22, 10, 0, 0),
941            "sess-tier-hash",
942            "user",
943            "Hello from Gemini.",
944            Some("fef6ad02174d592d21e7f8a6143564388027ec0c"),
945        );
946        let tiered = infer_tiered_identity_from_entry(&e, &ProjectHashRegistry::default());
947        assert!(
948            tiered.is_none(),
949            "hex hash without registry must not resolve"
950        );
951    }
952
953    #[test]
954    fn source_tier_hex_hash_resolves_through_registry() {
955        let root = mk_tmp_dir("tier-registry");
956        let repo = root.join("hosted").join("VetCoders").join("ai-contexters");
957        fs::create_dir_all(&repo).unwrap();
958
959        Command::new("git")
960            .arg("init")
961            .arg(&repo)
962            .output()
963            .expect("git init");
964        Command::new("git")
965            .arg("-C")
966            .arg(&repo)
967            .args([
968                "remote",
969                "add",
970                "origin",
971                "git@github.com:VetCoders/ai-contexters.git",
972            ])
973            .output()
974            .expect("git remote add");
975
976        let mut registry = ProjectHashRegistry::default();
977        registry.mappings.insert(
978            "fef6ad02174d592d21e7f8a6143564388027ec0c".to_string(),
979            repo.to_string_lossy().to_string(),
980        );
981
982        let e = entry(
983            (2026, 3, 22, 10, 0, 0),
984            "sess-tier-reg",
985            "user",
986            "Hello from Gemini.",
987            Some("fef6ad02174d592d21e7f8a6143564388027ec0c"),
988        );
989
990        let tiered =
991            infer_tiered_identity_from_entry(&e, &registry).expect("registry should resolve");
992        assert_eq!(tiered.tier, SourceTier::Secondary);
993        assert_eq!(tiered.identity.slug(), "VetCoders/ai-contexters");
994        assert!(tiered.tier.is_assertable());
995
996        let _ = fs::remove_dir_all(&root);
997    }
998
999    #[test]
1000    fn source_tier_registry_with_unknown_hash_returns_none() {
1001        let registry = ProjectHashRegistry::default();
1002        let e = entry(
1003            (2026, 3, 22, 10, 0, 0),
1004            "sess-tier-unknown",
1005            "user",
1006            "Hello from Gemini.",
1007            Some("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"),
1008        );
1009        let tiered = infer_tiered_identity_from_entry(&e, &registry);
1010        assert!(
1011            tiered.is_none(),
1012            "unknown hash must not resolve even with empty registry"
1013        );
1014    }
1015
1016    #[test]
1017    fn source_tier_classify_cwd_empty_is_opaque() {
1018        assert_eq!(classify_cwd_tier(None), SourceTier::Opaque);
1019        assert_eq!(classify_cwd_tier(Some("")), SourceTier::Opaque);
1020    }
1021
1022    #[test]
1023    fn source_tier_classify_cwd_hex_is_opaque() {
1024        assert_eq!(
1025            classify_cwd_tier(Some("57cfd37b3a72d995c4f2d018ebf9d5a2")),
1026            SourceTier::Opaque
1027        );
1028    }
1029
1030    #[test]
1031    fn segments_carry_source_tier() {
1032        let entries = vec![
1033            entry(
1034                (2026, 3, 22, 10, 0, 0),
1035                "sess-st",
1036                "user",
1037                "Check https://github.com/VetCoders/ai-contexters",
1038                None,
1039            ),
1040            entry(
1041                (2026, 3, 22, 10, 1, 0),
1042                "sess-st",
1043                "assistant",
1044                "Reviewing now.",
1045                None,
1046            ),
1047        ];
1048
1049        let segments = semantic_segments(&entries);
1050        assert_eq!(segments.len(), 1);
1051        assert_eq!(segments[0].source_tier, Some(SourceTier::Fallback));
1052        assert!(!segments[0].has_assertable_identity());
1053    }
1054
1055    #[test]
1056    fn segments_without_repo_have_no_tier() {
1057        let entries = vec![entry(
1058            (2026, 3, 22, 10, 0, 0),
1059            "sess-none",
1060            "user",
1061            "Just chatting, no repo context.",
1062            None,
1063        )];
1064
1065        let segments = semantic_segments(&entries);
1066        assert_eq!(segments.len(), 1);
1067        assert!(segments[0].repo.is_none());
1068        assert!(segments[0].source_tier.is_none());
1069        assert!(!segments[0].has_assertable_identity());
1070    }
1071
1072    #[test]
1073    fn segments_opaque_cwd_routes_to_non_repo() {
1074        let entries = vec![entry(
1075            (2026, 3, 22, 10, 0, 0),
1076            "sess-opaque",
1077            "user",
1078            "Gemini session with opaque hash only.",
1079            Some("fef6ad02174d592d21e7f8a6143564388027ec0c"),
1080        )];
1081
1082        let segments = semantic_segments(&entries);
1083        assert_eq!(segments.len(), 1);
1084        assert!(segments[0].repo.is_none());
1085        assert_eq!(segments[0].project_label(), "non-repository-contexts");
1086    }
1087
1088    #[test]
1089    fn segments_opaque_cwd_resolves_with_registry() {
1090        let root = mk_tmp_dir("seg-registry");
1091        let repo = root.join("hosted").join("VetCoders").join("ai-contexters");
1092        fs::create_dir_all(&repo).unwrap();
1093
1094        Command::new("git")
1095            .arg("init")
1096            .arg(&repo)
1097            .output()
1098            .expect("git init");
1099        Command::new("git")
1100            .arg("-C")
1101            .arg(&repo)
1102            .args([
1103                "remote",
1104                "add",
1105                "origin",
1106                "git@github.com:VetCoders/ai-contexters.git",
1107            ])
1108            .output()
1109            .expect("git remote add");
1110
1111        let mut registry = ProjectHashRegistry::default();
1112        registry.mappings.insert(
1113            "fef6ad02174d592d21e7f8a6143564388027ec0c".to_string(),
1114            repo.to_string_lossy().to_string(),
1115        );
1116
1117        let entries = vec![entry(
1118            (2026, 3, 22, 10, 0, 0),
1119            "sess-reg",
1120            "user",
1121            "Gemini session with mapped hash.",
1122            Some("fef6ad02174d592d21e7f8a6143564388027ec0c"),
1123        )];
1124
1125        let segments = semantic_segments_with_registry(&entries, &registry);
1126        assert_eq!(segments.len(), 1);
1127        assert!(segments[0].repo.is_some());
1128        assert_eq!(segments[0].source_tier, Some(SourceTier::Secondary));
1129        assert_eq!(segments[0].project_label(), "VetCoders/ai-contexters");
1130
1131        let _ = fs::remove_dir_all(&root);
1132    }
1133
1134    #[test]
1135    fn project_hash_registry_roundtrip() {
1136        let root = mk_tmp_dir("registry-roundtrip");
1137        fs::create_dir_all(&root).unwrap();
1138        let path = root.join("gemini-project-map.json");
1139
1140        let mut registry = ProjectHashRegistry::default();
1141        registry.mappings.insert(
1142            "abc123".to_string(),
1143            "/home/user/repos/my-project".to_string(),
1144        );
1145
1146        let json = serde_json::to_string_pretty(&registry).unwrap();
1147        fs::write(&path, &json).unwrap();
1148
1149        let loaded = ProjectHashRegistry::load_from(&path);
1150        assert_eq!(loaded.mappings.len(), 1);
1151        assert_eq!(
1152            loaded.mappings.get("abc123").map(String::as_str),
1153            Some("/home/user/repos/my-project")
1154        );
1155
1156        let _ = fs::remove_dir_all(&root);
1157    }
1158
1159    #[test]
1160    fn project_hash_registry_missing_file_returns_empty() {
1161        let registry = ProjectHashRegistry::load_from(Path::new("/nonexistent/path.json"));
1162        assert!(registry.mappings.is_empty());
1163    }
1164
1165    #[test]
1166    fn source_tier_ordering() {
1167        assert!(SourceTier::Primary < SourceTier::Secondary);
1168        assert!(SourceTier::Secondary < SourceTier::Fallback);
1169        assert!(SourceTier::Fallback < SourceTier::Opaque);
1170    }
1171
1172    #[test]
1173    fn source_tier_assertable_boundaries() {
1174        assert!(SourceTier::Primary.is_assertable());
1175        assert!(SourceTier::Secondary.is_assertable());
1176        assert!(!SourceTier::Fallback.is_assertable());
1177        assert!(!SourceTier::Opaque.is_assertable());
1178    }
1179}