Skip to main content

sqry_core/workspace/
logical.rs

1//! Logical workspace data model.
2//!
3//! A `LogicalWorkspace` is the unit of identity for cross-repo / workspace-aware
4//! indexing. It carries:
5//!
6//! - The `WorkspaceIdentity` from which a stable `WorkspaceId` (BLAKE3-256
7//!   digest) is derived.
8//! - A list of `SourceRoot`s — directories that are auto-indexed and queried.
9//! - A list of `MemberFolder`s — directories that are part of the workspace
10//!   but **not** auto-indexed (they fall through to the workspace's source
11//!   roots when queried).
12//! - An explicit `exclusions` list — paths that are opaque to sqry.
13//! - Workspace-scoped metadata: `ProjectRootMode`, optional
14//!   `index_root_override`, and a `config_fingerprint` placeholder
15//!   (populated by the plugin-selection / cost-tier pipeline in a later
16//!   step).
17//!
18//! Exhaustive design + identity rules: see
19//! `docs/development/workspace-aware-cross-repo/03_IMPLEMENTATION_PLAN.md` §1.
20//!
21//! ## Identity-input canonicalization (deterministic)
22//!
23//! 1. Every path is funneled through
24//!    [`crate::project::path_utils::canonicalize_path`], which resolves
25//!    symlinks via `realpath(3)` when possible, falling back to lexical
26//!    absolutization when the target does not exist.
27//! 2. When canonicalization falls back (i.e. the path could not be resolved
28//!    against the filesystem), the surrounding identity records the fact
29//!    via a `symlink_unresolved: bool`. The flag is part of the hash input,
30//!    so a missing-then-existing path will produce a different
31//!    `WorkspaceId` (this is correct — the shape of the workspace changed).
32//! 3. On case-insensitive mounts (best-effort detected per path) paths are
33//!    lowercased before hashing, so case variants resolve to the same
34//!    `WorkspaceId`. On case-sensitive filesystems (default on Linux) the
35//!    detection returns `false` and paths are hashed verbatim.
36//! 4. `AnonymousMultiRoot.folders` is sorted lexically before hashing so a
37//!    reorder of workspace folders is identity-preserving.
38//! 5. `config_fingerprint` is **not** included in the hash — it is a
39//!    separate cache dimension.
40//!
41//! ## Member vs Excluded — the contract
42//!
43//! - `Source` paths and their descendants are owned by the source root.
44//! - `Member` paths are part of the logical workspace but not auto-indexed.
45//!   Reads still resolve via the workspace's source roots; status returns
46//!   the *aggregate* workspace status.
47//! - `Excluded` paths are opaque — searches return empty with an explicit
48//!   `excluded` flag.
49//! - `Unknown` paths sit outside the workspace entirely.
50
51use std::collections::BTreeMap;
52use std::fs;
53use std::io;
54use std::path::{Path, PathBuf};
55
56use blake3::Hasher;
57use serde::{Deserialize, Serialize};
58use thiserror::Error;
59
60use crate::project::path_utils::canonicalize_path;
61use crate::project::types::ProjectRootMode;
62
63use super::registry::WorkspaceRegistry;
64
65// ---------------------------------------------------------------------------
66// Errors
67// ---------------------------------------------------------------------------
68
/// Errors produced while constructing a [`LogicalWorkspace`].
///
/// The `Display` text of every variant comes from its `#[error(...)]`
/// attribute (generated by `thiserror`).
#[derive(Debug, Error)]
pub enum LogicalWorkspaceError {
    /// Generic IO failure that is not tied to a specific path.
    ///
    /// `#[from]` lets `?` lift a bare [`io::Error`] into this variant.
    /// `from_sqry_workspace` additionally uses it to wrap registry-load
    /// failures that are neither parse nor IO errors (stringified through
    /// `io::Error::other`).
    #[error("io error: {0}")]
    Io(#[from] io::Error),

    /// A path could not be canonicalized.
    // NOTE(review): not constructed in the part of the file reviewed here;
    // presumably raised by `canonicalize_with_flag` — confirm.
    #[error("failed to canonicalize {path}: {source}")]
    Canonicalization {
        /// The path that failed to canonicalize.
        path: PathBuf,
        /// Underlying IO error.
        source: io::Error,
    },

    /// The `.sqry-workspace` registry JSON could not be parsed.
    ///
    /// Raised by `from_sqry_workspace` when `WorkspaceRegistry::load`
    /// reports a serialization error.
    #[error("failed to parse .sqry-workspace registry: {0}")]
    ParseSqryWorkspace(serde_json::Error),

    /// The `.code-workspace` JSON could not be parsed.
    ///
    /// Raised by `from_code_workspace` when the file's bytes are not valid
    /// JSON.
    #[error("failed to parse .code-workspace file: {0}")]
    ParseCodeWorkspace(serde_json::Error),

    /// A `folders[i]` entry in a `.code-workspace` is malformed.
    ///
    /// Raised when an entry has no string `path`, or carries a `sqry.role`
    /// value outside the accepted set.
    #[error("malformed .code-workspace folder entry: {reason}")]
    MalformedFolderEntry {
        /// Human-readable reason describing the malformed entry.
        reason: String,
    },

    /// The same path was classified into two conflicting roles.
    // NOTE(review): no constructor reviewed here produces this variant;
    // presumably raised by a validation step elsewhere — confirm.
    #[error("conflicting classification for {path}: {kinds}")]
    ConflictingClassification {
        /// The path with conflicting classifications.
        path: PathBuf,
        /// A description of the conflicting kinds.
        kinds: String,
    },
}
109
110// ---------------------------------------------------------------------------
111// WorkspaceId — BLAKE3-256 typed digest
112// ---------------------------------------------------------------------------
113
/// Stable identity for a [`LogicalWorkspace`].
///
/// 32 bytes (BLAKE3-256) over the canonicalized identity inputs.
/// The stored digest is never truncated to 64 bits — the full 256-bit
/// space is used to keep the collision probability astronomically small
/// across processes / caches. (The 16-character short hex form exists for
/// display only.)
///
/// The inner array is private: values come from
/// `WorkspaceId::from_identity` or from deserialization, and are read back
/// through `as_bytes` / `as_full_hex`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct WorkspaceId([u8; 32]);
121
122impl WorkspaceId {
123    /// Compute a `WorkspaceId` from the canonical identity inputs of a
124    /// [`WorkspaceIdentity`]. The hashing scheme is documented in
125    /// `03_IMPLEMENTATION_PLAN.md` §1 and tested via the round-trip
126    /// stability tests in `workspace::tests`.
127    #[must_use]
128    pub fn from_identity(identity: &WorkspaceIdentity) -> Self {
129        let mut hasher = Hasher::new();
130        identity.write_hash_input(&mut hasher);
131        Self(*hasher.finalize().as_bytes())
132    }
133
134    /// Borrow the raw 32-byte digest.
135    #[must_use]
136    pub fn as_bytes(&self) -> &[u8; 32] {
137        &self.0
138    }
139
140    /// First 16 hex characters of the digest. Suitable for log lines and
141    /// short file names; **not** sufficient for cross-process identity.
142    #[must_use]
143    pub fn as_short_hex(&self) -> String {
144        let full = self.as_full_hex();
145        full[..16].to_string()
146    }
147
148    /// Full 64-character hex digest. Use this for any identity comparison.
149    #[must_use]
150    pub fn as_full_hex(&self) -> String {
151        use std::fmt::Write as _;
152        let mut s = String::with_capacity(64);
153        for byte in &self.0 {
154            // `write!` to a `String` is infallible.
155            let _ = write!(s, "{byte:02x}");
156        }
157        s
158    }
159}
160
161impl std::fmt::Display for WorkspaceId {
162    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
163        f.write_str(&self.as_short_hex())
164    }
165}
166
167// ---------------------------------------------------------------------------
168// WorkspaceIdentity
169// ---------------------------------------------------------------------------
170
/// The identity inputs of a logical workspace. `WorkspaceId` is computed
/// deterministically from these inputs.
///
/// Serialized as an internally tagged enum: the variant name is stored
/// under the `kind` key (see the `serde` attribute below).
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(tag = "kind", rename_all = "camelCase")]
pub enum WorkspaceIdentity {
    /// Identity derived from a `.sqry-workspace` registry file.
    SqryWorkspaceFile {
        /// Canonical absolute path to the registry file.
        path: PathBuf,
        /// `true` if `path` could not be filesystem-canonicalized
        /// (lexical fallback was used).
        symlink_unresolved: bool,
    },
    /// Identity derived from a VS Code `.code-workspace` file.
    VsCodeWorkspaceFile {
        /// Canonical absolute path to the workspace file.
        path: PathBuf,
        /// `true` if `path` could not be filesystem-canonicalized.
        symlink_unresolved: bool,
    },
    /// Identity derived from an ad-hoc multi-folder VS Code workspace.
    AnonymousMultiRoot {
        /// Folder roots, **sorted lexically** before hashing for stability.
        /// `LogicalWorkspace::anonymous_multi_root` stores them already
        /// sorted.
        folders: Vec<PathBuf>,
        /// `true` if any of the folders could not be canonicalized.
        symlink_unresolved: bool,
    },
    /// Identity derived from a single root path (e.g. `sqry index <path>`).
    SingleRoot {
        /// Canonical absolute root path.
        path: PathBuf,
        /// `true` if `path` could not be canonicalized.
        symlink_unresolved: bool,
    },
}
206
207impl WorkspaceIdentity {
208    /// Tag byte used in the BLAKE3 hash input. Stable; do not renumber.
209    fn tag_byte(&self) -> u8 {
210        match self {
211            Self::SqryWorkspaceFile { .. } => 0,
212            Self::VsCodeWorkspaceFile { .. } => 1,
213            Self::AnonymousMultiRoot { .. } => 2,
214            Self::SingleRoot { .. } => 3,
215        }
216    }
217
218    /// `symlink_unresolved` flag — recorded in the hash input.
219    fn symlink_unresolved(&self) -> bool {
220        match self {
221            Self::SqryWorkspaceFile {
222                symlink_unresolved, ..
223            }
224            | Self::VsCodeWorkspaceFile {
225                symlink_unresolved, ..
226            }
227            | Self::AnonymousMultiRoot {
228                symlink_unresolved, ..
229            }
230            | Self::SingleRoot {
231                symlink_unresolved, ..
232            } => *symlink_unresolved,
233        }
234    }
235
236    /// Write the deterministic hash input for `WorkspaceId` derivation.
237    fn write_hash_input(&self, hasher: &mut Hasher) {
238        hasher.update(&[self.tag_byte()]);
239        // 0x00 / 0x01 byte for symlink_unresolved.
240        hasher.update(&[u8::from(self.symlink_unresolved())]);
241        match self {
242            Self::SqryWorkspaceFile { path, .. }
243            | Self::VsCodeWorkspaceFile { path, .. }
244            | Self::SingleRoot { path, .. } => {
245                hash_path(hasher, path);
246            }
247            Self::AnonymousMultiRoot { folders, .. } => {
248                let count = u32::try_from(folders.len()).unwrap_or(u32::MAX);
249                hasher.update(&count.to_le_bytes());
250                for folder in folders {
251                    hash_path(hasher, folder);
252                }
253            }
254        }
255    }
256}
257
258/// Hash a single canonical path: u32 LE byte length followed by the
259/// path's UTF-8 bytes (lossy if the path is not valid UTF-8 — extremely
260/// unusual on supported targets, but we never panic).
261fn hash_path(hasher: &mut Hasher, path: &Path) {
262    let s = path.to_string_lossy();
263    let bytes = s.as_bytes();
264    let len = u32::try_from(bytes.len()).unwrap_or(u32::MAX);
265    hasher.update(&len.to_le_bytes());
266    hasher.update(bytes);
267}
268
269// ---------------------------------------------------------------------------
270// SourceRoot, MemberFolder, Classification
271// ---------------------------------------------------------------------------
272
/// A directory that is auto-indexed by sqry. One `.sqry/graph/manifest.json`
/// per source root.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct SourceRoot {
    /// Canonical absolute path to the source root.
    pub path: PathBuf,
    /// Path to the per-source-root index manifest.
    /// `SourceRoot::from_path` derives `<path>/.sqry/graph/manifest.json`;
    /// `LogicalWorkspace::from_sqry_workspace` replaces that with the
    /// registry-supplied index path.
    pub index_path: PathBuf,
    /// Optional list of language hints to bias plugin selection.
    pub language_hints: Option<Vec<String>>,
    /// Optional path to the JVM classpath cache directory
    /// (`<path>/.sqry/classpath/`); set by `populate_classpath_dir` when
    /// that directory exists.
    pub classpath_dir: Option<PathBuf>,
    /// Per-source-root override of the workspace-level
    /// `config_fingerprint`. `0` means "unset": see
    /// `effective_config_fingerprint`, which then falls back to the
    /// workspace-level value.
    pub config_fingerprint: u64,
}
292
293impl SourceRoot {
294    /// Build a `SourceRoot` from a canonical path, deriving the standard
295    /// `.sqry/graph/manifest.json` index path. `language_hints` is
296    /// `None`; `classpath_dir` is `None`; `config_fingerprint` is `0`.
297    #[must_use]
298    pub fn from_path(path: PathBuf) -> Self {
299        let index_path = path.join(".sqry").join("graph").join("manifest.json");
300        Self {
301            path,
302            index_path,
303            language_hints: None,
304            classpath_dir: None,
305            config_fingerprint: 0,
306        }
307    }
308
309    /// STEP_11_4 — populate [`Self::classpath_dir`] from a probe of
310    /// `<self.path>/.sqry/classpath/`. The directory is set when the
311    /// probe finds a *directory* at that path, leaving the field
312    /// `None` when the path is missing or is not a directory.
313    ///
314    /// Returns `Ok(())` on a successful (possibly negative) probe.
315    /// Returns the raw [`io::Error`] when the probe fails for a reason
316    /// other than `NotFound` (e.g. permission denied) so callers can
317    /// surface a [`super::cache::WorkspaceWarning::ClasspathProbeFailed`]
318    /// without losing the underlying error detail.
319    ///
320    /// # Errors
321    ///
322    /// Returns the underlying [`io::Error`] when [`fs::metadata`] fails
323    /// for a reason other than `NotFound`.
324    pub fn populate_classpath_dir(&mut self) -> io::Result<()> {
325        let probe = self.path.join(".sqry").join("classpath");
326        match fs::metadata(&probe) {
327            Ok(meta) if meta.is_dir() => {
328                self.classpath_dir = Some(probe);
329                Ok(())
330            }
331            Ok(_) => {
332                // Path exists but is not a directory — treat as
333                // "no classpath present" without raising an error.
334                self.classpath_dir = None;
335                Ok(())
336            }
337            Err(err) if err.kind() == io::ErrorKind::NotFound => {
338                self.classpath_dir = None;
339                Ok(())
340            }
341            Err(err) => Err(err),
342        }
343    }
344
345    /// STEP_11_4 — fluent builder for [`Self::config_fingerprint`].
346    ///
347    /// Used by call sites that hold a freshly computed
348    /// [`crate::config::compute_workspace_config_fingerprint`] value
349    /// alongside the source root. A fingerprint of `0` is the
350    /// "unset" sentinel and the builder accepts it for the same
351    /// reason `WorkspaceKey::config_fingerprint = 0` is the default
352    /// — call sites that want strict identity must supply a non-zero
353    /// value.
354    #[must_use]
355    pub fn with_config_fingerprint(mut self, fingerprint: u64) -> Self {
356        self.config_fingerprint = fingerprint;
357        self
358    }
359
360    /// STEP_11_4 — return the per-source-root config fingerprint with
361    /// fallback to a workspace-level default supplied by the caller.
362    ///
363    /// `SourceRoot.config_fingerprint == 0` is treated as "use the
364    /// workspace-level fingerprint" — the daemon `WorkspaceKey`
365    /// dimension this powers must always carry the workspace's
366    /// fingerprint when no per-source-root override is set so two
367    /// otherwise-identical paths under different workspaces stay in
368    /// distinct cache entries.
369    #[must_use]
370    pub fn effective_config_fingerprint(&self, workspace_default: u64) -> u64 {
371        if self.config_fingerprint == 0 {
372            workspace_default
373        } else {
374            self.config_fingerprint
375        }
376    }
377}
378
/// Why a folder was classified as a member (rather than a source root or
/// excluded path).
///
/// Variant names are serialized in camelCase (see the `serde` attribute).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub enum MemberReason {
    /// Folder holds tooling / build / scripts, not first-class source.
    /// Selected by the `.code-workspace` folder role `"operational"`.
    OperationalFolder,
    /// Folder exists but contains no plugin-recognized source files.
    /// Selected by the folder roles `"non-source"`, `"nonSource"` and
    /// `"non_source"`.
    NonSourceFolder,
    /// Heuristic could not match any registered language plugin
    /// (last-resort default — see §1.1). Assigned by
    /// `LogicalWorkspace::from_code_workspace` when the injected heuristic
    /// answers `HeuristicVerdict::Unknown`.
    NoLanguagePluginMatch,
}
392
/// A folder that is part of the logical workspace but is **not**
/// auto-indexed.
///
/// Per the module-level contract, reads under a member folder still
/// resolve via the workspace's source roots.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct MemberFolder {
    /// Canonical absolute path to the member folder.
    pub path: PathBuf,
    /// Why the folder was classified as a member.
    pub reason: MemberReason,
}
402
/// The result of [`LogicalWorkspace::classify`].
///
/// Serialized as an internally tagged enum: the camelCase variant name is
/// stored under the `kind` key (see the `serde` attribute below).
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(tag = "kind", rename_all = "camelCase")]
pub enum Classification {
    /// Path is a known source root or a descendant of one.
    Source,
    /// Path is a known member folder or a descendant of one.
    Member {
        /// Why the owning member folder was so classified.
        reason: MemberReason,
    },
    /// Path was explicitly excluded.
    Excluded,
    /// Path is outside the logical workspace entirely.
    Unknown,
}
419
/// Verdict returned by an injected heuristic classifier when a folder is
/// not explicitly classified by the user.
///
/// The heuristic policy itself lives outside `sqry-core` (in the LSP /
/// extension / wrapper); `sqry-core` accepts it as an injected
/// `&dyn Fn(&Path) -> HeuristicVerdict` so policy stays separate from the
/// data model.
///
/// `LogicalWorkspace::from_code_workspace` consults the heuristic only
/// after the per-folder role and the top-level overrides have all failed
/// to classify a folder.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum HeuristicVerdict {
    /// Folder should be treated as a source root.
    Source,
    /// Folder should be treated as a member (with a specific reason).
    Member {
        /// Reason the folder is a member.
        reason: MemberReason,
    },
    /// Folder should be excluded.
    Excluded,
    /// Heuristic could not classify; caller decides the last-resort
    /// default. `from_code_workspace` maps this to a member folder with
    /// `MemberReason::NoLanguagePluginMatch`.
    Unknown,
}
442
443// ---------------------------------------------------------------------------
444// LogicalWorkspace
445// ---------------------------------------------------------------------------
446
/// A logical workspace — the unit of identity for cross-repo / workspace
/// indexing.
///
/// All fields are private; instances are built through the constructors
/// on this type.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct LogicalWorkspace {
    /// Identity inputs this workspace was built from.
    identity: WorkspaceIdentity,
    /// Digest derived from `identity` via `WorkspaceId::from_identity`.
    workspace_id: WorkspaceId,
    /// Directories that are auto-indexed and queried.
    source_roots: Vec<SourceRoot>,
    /// Directories that belong to the workspace but are not auto-indexed.
    member_folders: Vec<MemberFolder>,
    /// Paths that are opaque to sqry.
    exclusions: Vec<PathBuf>,
    /// Workspace-scoped project-root mode.
    project_root_mode: ProjectRootMode,
    /// Optional index-root override; the constructors reviewed here
    /// initialize it to `None`.
    index_root_override: Option<PathBuf>,
    /// Workspace-level config fingerprint; the constructors reviewed here
    /// initialize it to `0`. Not part of the `WorkspaceId` hash — it is a
    /// separate cache dimension.
    config_fingerprint: u64,
}
460
461impl LogicalWorkspace {
462    /// Construct from a `.sqry-workspace` registry file.
463    ///
464    /// `WorkspaceRegistry::load` accepts both v1 (flat `repositories`
465    /// list) and v2 (`source_roots`, `member_folders`, `exclusions`,
466    /// `project_root_mode`) on-disk shapes — v1 is auto-upgraded to v2
467    /// in memory. This constructor projects every v2 field into the
468    /// resulting [`LogicalWorkspace`]:
469    ///
470    /// * `repositories` → `source_roots` (canonicalized).
471    /// * `member_folders` → `MemberFolder { path, reason }` (canonicalized).
472    /// * `exclusions` → canonical absolute paths.
473    /// * `project_root_mode` → carried verbatim.
474    ///
475    /// `STEP_7` codex iter4 fix — pre-iter4 this constructor dropped
476    /// `member_folders`, `exclusions`, and `project_root_mode` on the
477    /// floor, defeating acceptance criteria 5/6 end-to-end (the redactor
478    /// receives an empty `LogicalWorkspaceView::exclusions` /
479    /// `member_folders`, so `redact_excluded_in_passthrough` and the
480    /// member-folder prefix renderer never fire on real
481    /// `.sqry-workspace`-loaded sessions). The pre-iter4 inline TODO
482    /// pointed at "STEP_2 will overhaul the registry layer entirely" —
483    /// STEP_2 shipped the registry-side v2 schema but did not update this
484    /// projection. Fixed here so STEP_7's MCP redaction wiring is
485    /// observable end-to-end.
486    ///
487    /// # Errors
488    ///
489    /// Returns [`LogicalWorkspaceError`] when the registry file cannot be
490    /// loaded or any path canonicalization fails irrecoverably.
491    pub fn from_sqry_workspace(path: &Path) -> Result<Self, LogicalWorkspaceError> {
492        // Load the registry. v1 files are auto-upgraded to v2 in memory
493        // by `WorkspaceRegistry::load`; we propagate serde errors as a
494        // dedicated variant so callers can distinguish parse failures
495        // from IO.
496        let registry = WorkspaceRegistry::load(path).map_err(|err| match err {
497            super::error::WorkspaceError::Serialization(e) => {
498                LogicalWorkspaceError::ParseSqryWorkspace(e)
499            }
500            super::error::WorkspaceError::Io { source, .. } => LogicalWorkspaceError::Io(source),
501            other => LogicalWorkspaceError::Io(io::Error::other(other.to_string())),
502        })?;
503
504        let (canonical_path, symlink_unresolved) = canonicalize_with_flag(path)?;
505        let identity = WorkspaceIdentity::SqryWorkspaceFile {
506            path: maybe_lowercase(&canonical_path),
507            symlink_unresolved,
508        };
509        let workspace_id = WorkspaceId::from_identity(&identity);
510
511        let mut source_roots = Vec::with_capacity(registry.repositories.len());
512        for repo in &registry.repositories {
513            let (canonical_repo, _unresolved) = canonicalize_with_flag(&repo.root)?;
514            let mut root = SourceRoot::from_path(canonical_repo);
515            // Preserve the registry-supplied index_path if it points at
516            // a real manifest (registry v1 uses `<repo>/.sqry-index`,
517            // not `.sqry/graph/manifest.json`); leave `from_path`'s
518            // computed manifest path otherwise.
519            root.index_path.clone_from(&repo.index_path);
520            if let Some(lang) = repo.primary_language.clone() {
521                root.language_hints = Some(vec![lang]);
522            }
523            source_roots.push(root);
524        }
525
526        // v2 projection: carry member_folders, exclusions, and
527        // project_root_mode through to the LogicalWorkspace so the MCP
528        // redactor (and any other consumer of `member_folders()` /
529        // `exclusions()`) sees the same structure the registry persists.
530        let mut member_folders = Vec::with_capacity(registry.member_folders.len());
531        for member in &registry.member_folders {
532            let (canonical_root, _unresolved) = canonicalize_with_flag(&member.root)?;
533            member_folders.push(MemberFolder {
534                path: canonical_root,
535                reason: member.reason,
536            });
537        }
538
539        let mut exclusions = Vec::with_capacity(registry.exclusions.len());
540        for excluded in &registry.exclusions {
541            let (canonical_excluded, _unresolved) = canonicalize_with_flag(excluded)?;
542            exclusions.push(canonical_excluded);
543        }
544
545        let mut ws = Self {
546            identity,
547            workspace_id,
548            source_roots,
549            member_folders,
550            exclusions,
551            project_root_mode: registry.project_root_mode,
552            index_root_override: None,
553            config_fingerprint: 0,
554        };
555        // STEP_11_4 — auto-populate classpath_dir on every source root.
556        let _failures = ws.populate_classpath_dirs();
557        Ok(ws)
558    }
559
560    /// Construct from a `.code-workspace` JSON file.
561    ///
562    /// The `heuristic_fn` is invoked for every folder that does not carry
563    /// an explicit `sqry.role`, is not in the top-level
564    /// `sqry.workspace.sourceRoots` / `.exclusions` overrides, and is not
565    /// already classified as a member by an explicit
566    /// `sqry.workspace.memberFolders` entry.
567    ///
568    /// # Errors
569    ///
570    /// Returns [`LogicalWorkspaceError`] for IO failures, JSON parse
571    /// errors, malformed folder entries, or path canonicalization
572    /// failures that cannot be recovered via lexical absolutization.
573    #[allow(clippy::too_many_lines)] // single-pass classifier; splitting hurts clarity.
574    pub fn from_code_workspace(
575        workspace_file: &Path,
576        heuristic_fn: &dyn Fn(&Path) -> HeuristicVerdict,
577    ) -> Result<Self, LogicalWorkspaceError> {
578        let bytes = fs::read(workspace_file)?;
579        let json: serde_json::Value =
580            serde_json::from_slice(&bytes).map_err(LogicalWorkspaceError::ParseCodeWorkspace)?;
581
582        let workspace_dir = workspace_file
583            .parent()
584            .map_or_else(|| PathBuf::from("."), Path::to_path_buf);
585
586        // Resolve the canonical workspace-file path for identity.
587        let (canonical_workspace_file, symlink_unresolved) =
588            canonicalize_with_flag(workspace_file)?;
589        let identity = WorkspaceIdentity::VsCodeWorkspaceFile {
590            path: maybe_lowercase(&canonical_workspace_file),
591            symlink_unresolved,
592        };
593        let workspace_id = WorkspaceId::from_identity(&identity);
594
595        // Collect folder entries. Per the .code-workspace spec each
596        // folder has a `path` (required) and optional `name`, plus
597        // sqry-specific `sqry.role`.
598        let folders_v = json.get("folders").cloned().unwrap_or_default();
599        let folders_arr = folders_v.as_array().cloned().unwrap_or_default();
600
601        // Top-level sqry.workspace overrides.
602        let sqry_top = json.get("sqry.workspace");
603        let top_source_roots = path_set_from_value(sqry_top, "sourceRoots", &workspace_dir);
604        let top_exclusions = path_set_from_value(sqry_top, "exclusions", &workspace_dir);
605        let top_members = member_overrides_from_value(sqry_top, &workspace_dir)?;
606        let project_root_mode = sqry_top
607            .and_then(|v| v.get("projectRootMode"))
608            .and_then(|v| v.as_str())
609            .and_then(ProjectRootMode::from_str_opt)
610            .unwrap_or_default();
611
612        // Build per-path classification map. The key is the absolute
613        // path *as configured*; we canonicalize at the end.
614        let mut classified: BTreeMap<PathBuf, FolderClassKind> = BTreeMap::new();
615        let mut all_folders: Vec<PathBuf> = Vec::new();
616
617        for (idx, entry) in folders_arr.iter().enumerate() {
618            let raw_path = entry.get("path").and_then(|v| v.as_str()).ok_or_else(|| {
619                LogicalWorkspaceError::MalformedFolderEntry {
620                    reason: format!("folders[{idx}] missing string `path`"),
621                }
622            })?;
623            let abs = if Path::new(raw_path).is_absolute() {
624                PathBuf::from(raw_path)
625            } else {
626                workspace_dir.join(raw_path)
627            };
628            all_folders.push(abs.clone());
629
630            // Step 4: explicit per-folder `sqry.role` always wins.
631            if let Some(role) = entry.get("sqry.role").and_then(|v| v.as_str()) {
632                let kind = match role {
633                    "source" => FolderClassKind::Source,
634                    "operational" => FolderClassKind::Member(MemberReason::OperationalFolder),
635                    "non-source" | "nonSource" | "non_source" => {
636                        FolderClassKind::Member(MemberReason::NonSourceFolder)
637                    }
638                    "excluded" => FolderClassKind::Excluded,
639                    other => {
640                        return Err(LogicalWorkspaceError::MalformedFolderEntry {
641                            reason: format!(
642                                "folders[{idx}].sqry.role = '{other}' (expected source|operational|excluded|non-source)"
643                            ),
644                        });
645                    }
646                };
647                classified.insert(abs, kind);
648                continue;
649            }
650
651            // Step 5: top-level sqry.workspace overrides.
652            if top_exclusions.contains(&abs) {
653                classified.insert(abs, FolderClassKind::Excluded);
654                continue;
655            }
656            if top_source_roots.contains(&abs) {
657                classified.insert(abs, FolderClassKind::Source);
658                continue;
659            }
660            if let Some(reason) = top_members.get(&abs).copied() {
661                classified.insert(abs, FolderClassKind::Member(reason));
662                continue;
663            }
664
665            // Step 6: heuristic fallback.
666            let verdict = heuristic_fn(&abs);
667            let kind = match verdict {
668                HeuristicVerdict::Source => FolderClassKind::Source,
669                HeuristicVerdict::Member { reason } => FolderClassKind::Member(reason),
670                HeuristicVerdict::Excluded => FolderClassKind::Excluded,
671                HeuristicVerdict::Unknown => {
672                    // Step 7: last-resort default for unclassified folders.
673                    FolderClassKind::Member(MemberReason::NoLanguagePluginMatch)
674                }
675            };
676            classified.insert(abs, kind);
677        }
678
679        // Top-level overrides may reference paths that were not present in
680        // the `folders[]` array. Honor them too.
681        for path in &top_source_roots {
682            classified
683                .entry(path.clone())
684                .or_insert(FolderClassKind::Source);
685        }
686        for path in &top_exclusions {
687            classified
688                .entry(path.clone())
689                .or_insert(FolderClassKind::Excluded);
690        }
691        for (path, reason) in &top_members {
692            classified
693                .entry(path.clone())
694                .or_insert(FolderClassKind::Member(*reason));
695        }
696
697        // Materialize.
698        let mut source_roots = Vec::new();
699        let mut member_folders = Vec::new();
700        let mut exclusions = Vec::new();
701        for (raw_path, kind) in classified {
702            let (canonical, _unresolved) = canonicalize_with_flag(&raw_path)?;
703            let canonical = maybe_lowercase(&canonical);
704            match kind {
705                FolderClassKind::Source => source_roots.push(SourceRoot::from_path(canonical)),
706                FolderClassKind::Member(reason) => member_folders.push(MemberFolder {
707                    path: canonical,
708                    reason,
709                }),
710                FolderClassKind::Excluded => exclusions.push(canonical),
711            }
712        }
713
714        let mut ws = Self {
715            identity,
716            workspace_id,
717            source_roots,
718            member_folders,
719            exclusions,
720            project_root_mode,
721            index_root_override: None,
722            config_fingerprint: 0,
723        };
724        let _failures = ws.populate_classpath_dirs();
725        Ok(ws)
726    }
727
728    /// Construct an ad-hoc multi-root workspace (every folder is a source
729    /// root). Folders are sorted lexically before hashing so identity is
730    /// stable under reorder.
731    ///
732    /// # Errors
733    ///
734    /// Returns [`LogicalWorkspaceError`] if any folder cannot be
735    /// canonicalized irrecoverably.
736    #[allow(clippy::needless_pass_by_value)] // owning constructor.
737    pub fn anonymous_multi_root(folders: Vec<PathBuf>) -> Result<Self, LogicalWorkspaceError> {
738        let mut canonical_folders = Vec::with_capacity(folders.len());
739        let mut symlink_unresolved = false;
740        for folder in &folders {
741            let (canon, unresolved) = canonicalize_with_flag(folder)?;
742            symlink_unresolved |= unresolved;
743            canonical_folders.push(maybe_lowercase(&canon));
744        }
745        canonical_folders.sort();
746        let identity = WorkspaceIdentity::AnonymousMultiRoot {
747            folders: canonical_folders.clone(),
748            symlink_unresolved,
749        };
750        let workspace_id = WorkspaceId::from_identity(&identity);
751
752        let source_roots = canonical_folders
753            .iter()
754            .cloned()
755            .map(SourceRoot::from_path)
756            .collect();
757
758        let mut ws = Self {
759            identity,
760            workspace_id,
761            source_roots,
762            member_folders: Vec::new(),
763            exclusions: Vec::new(),
764            project_root_mode: ProjectRootMode::default(),
765            index_root_override: None,
766            config_fingerprint: 0,
767        };
768        let _failures = ws.populate_classpath_dirs();
769        Ok(ws)
770    }
771
772    /// Construct a single-root workspace (one source root, no members).
773    ///
774    /// # Errors
775    ///
776    /// Returns [`LogicalWorkspaceError`] if `path` cannot be canonicalized
777    /// irrecoverably.
778    #[allow(clippy::needless_pass_by_value)] // owning constructor.
779    pub fn single_root(path: PathBuf) -> Result<Self, LogicalWorkspaceError> {
780        let (canonical, symlink_unresolved) = canonicalize_with_flag(&path)?;
781        let canonical = maybe_lowercase(&canonical);
782        let identity = WorkspaceIdentity::SingleRoot {
783            path: canonical.clone(),
784            symlink_unresolved,
785        };
786        let workspace_id = WorkspaceId::from_identity(&identity);
787        let mut ws = Self {
788            identity,
789            workspace_id,
790            source_roots: vec![SourceRoot::from_path(canonical)],
791            member_folders: Vec::new(),
792            exclusions: Vec::new(),
793            project_root_mode: ProjectRootMode::default(),
794            index_root_override: None,
795            config_fingerprint: 0,
796        };
797        let _failures = ws.populate_classpath_dirs();
798        Ok(ws)
799    }
800
801    /// Test-only seam: construct a single-root workspace with the
802    /// case-sensitivity decision *forced* to `case_insensitive`,
803    /// bypassing live mount detection. Used by the
804    /// `case_insensitive_mount_produces_same_id_end_to_end` test to
805    /// exercise acceptance criterion 4 deterministically on
806    /// case-sensitive Linux hosts (where the live detector would
807    /// otherwise return `false` and short-circuit the lowercase path).
808    ///
809    /// The path is canonicalized via `path_utils::canonicalize_path`
810    /// for parity with [`Self::single_root`], but the case-folding
811    /// step uses the explicit `case_insensitive` argument instead of
812    /// `is_case_insensitive_mount`.
813    #[cfg(test)]
814    #[allow(clippy::needless_pass_by_value)]
815    pub(crate) fn single_root_with_case_sensitivity(
816        path: PathBuf,
817        case_insensitive: bool,
818    ) -> Result<Self, LogicalWorkspaceError> {
819        let (canonical, symlink_unresolved) = canonicalize_with_flag(&path)?;
820        let canonical = if case_insensitive {
821            PathBuf::from(canonical.to_string_lossy().to_lowercase())
822        } else {
823            canonical
824        };
825        let identity = WorkspaceIdentity::SingleRoot {
826            path: canonical.clone(),
827            symlink_unresolved,
828        };
829        let workspace_id = WorkspaceId::from_identity(&identity);
830        Ok(Self {
831            identity,
832            workspace_id,
833            source_roots: vec![SourceRoot::from_path(canonical)],
834            member_folders: Vec::new(),
835            exclusions: Vec::new(),
836            project_root_mode: ProjectRootMode::default(),
837            index_root_override: None,
838            config_fingerprint: 0,
839        })
840    }
841
842    // -- Accessors --
843
844    /// The stable BLAKE3-256 identity of this workspace.
845    #[must_use]
846    pub fn workspace_id(&self) -> &WorkspaceId {
847        &self.workspace_id
848    }
849
850    /// The identity inputs that produced [`Self::workspace_id`].
851    #[must_use]
852    pub fn identity(&self) -> &WorkspaceIdentity {
853        &self.identity
854    }
855
856    /// The auto-indexed source roots.
857    #[must_use]
858    pub fn source_roots(&self) -> &[SourceRoot] {
859        &self.source_roots
860    }
861
862    /// The non-indexed member folders.
863    #[must_use]
864    pub fn member_folders(&self) -> &[MemberFolder] {
865        &self.member_folders
866    }
867
868    /// Explicitly excluded paths.
869    #[must_use]
870    pub fn exclusions(&self) -> &[PathBuf] {
871        &self.exclusions
872    }
873
874    /// The workspace-level [`ProjectRootMode`].
875    #[must_use]
876    pub fn project_root_mode(&self) -> ProjectRootMode {
877        self.project_root_mode
878    }
879
880    /// Optional `--index-root` override.
881    #[must_use]
882    pub fn index_root_override(&self) -> Option<&Path> {
883        self.index_root_override.as_deref()
884    }
885
886    /// Workspace-level config fingerprint. Populated by the
887    /// plugin-selection / cost-tier pipeline via
888    /// [`Self::set_config_fingerprint`] and consumed by
889    /// `sqry-daemon::WorkspaceKey` so two source roots sharing path
890    /// but differing fingerprint stay in distinct cache entries.
891    #[must_use]
892    pub fn config_fingerprint(&self) -> u64 {
893        self.config_fingerprint
894    }
895
896    /// STEP_11_4 — set the workspace-level config fingerprint computed
897    /// via [`crate::config::compute_workspace_config_fingerprint`].
898    ///
899    /// The fingerprint is **not** part of the [`WorkspaceId`] hash
900    /// input — it is a separate cache dimension consumed by the
901    /// daemon's `WorkspaceKey`. Two `LogicalWorkspace`s with the
902    /// same identity but different fingerprints share an identity but
903    /// produce distinct daemon cache entries.
904    pub fn set_config_fingerprint(&mut self, fingerprint: u64) {
905        self.config_fingerprint = fingerprint;
906    }
907
908    /// STEP_11_4 — set the workspace-level config fingerprint and
909    /// propagate it to every [`SourceRoot`] that does not already
910    /// carry an explicit per-root override (i.e. whose
911    /// `config_fingerprint == 0`).
912    ///
913    /// This is the typical wiring point: callers compute one
914    /// workspace-level fingerprint, then call
915    /// `set_config_fingerprint_with_inheritance` so source roots
916    /// without an explicit override inherit the workspace value.
917    /// Source roots that carry a non-zero override are left
918    /// untouched.
919    pub fn set_config_fingerprint_with_inheritance(&mut self, fingerprint: u64) {
920        self.config_fingerprint = fingerprint;
921        for root in &mut self.source_roots {
922            if root.config_fingerprint == 0 {
923                root.config_fingerprint = fingerprint;
924            }
925        }
926    }
927
928    /// STEP_11_4 — populate every [`SourceRoot::classpath_dir`] in this
929    /// workspace by probing `<root>/.sqry/classpath/` for each. Returns
930    /// a vector of `(source_root, io::Error)` pairs for any probe that
931    /// failed for a reason other than `NotFound`; callers typically
932    /// fold these into [`super::cache::WorkspaceWarning::ClasspathProbeFailed`].
933    pub fn populate_classpath_dirs(&mut self) -> Vec<(PathBuf, io::Error)> {
934        let mut failures = Vec::new();
935        for root in &mut self.source_roots {
936            if let Err(err) = root.populate_classpath_dir() {
937                failures.push((root.path.clone(), err));
938            }
939        }
940        failures
941    }
942
943    /// Returns `true` if `path` matches one of the registered source
944    /// roots exactly (not a descendant).
945    #[must_use]
946    pub fn is_source_root(&self, path: &Path) -> bool {
947        let canonical =
948            canonicalize_path(path).map_or_else(|_| path.to_path_buf(), |p| maybe_lowercase(&p));
949        self.source_roots.iter().any(|r| r.path == canonical)
950    }
951
952    /// Classify a path against the workspace per §1.4 of the
953    /// implementation plan.
954    #[must_use]
955    pub fn classify(&self, path: &Path) -> Classification {
956        let canonical =
957            canonicalize_path(path).map_or_else(|_| path.to_path_buf(), |p| maybe_lowercase(&p));
958
959        // 1. Exclusion match (exact or descendant).
960        if self
961            .exclusions
962            .iter()
963            .any(|excl| path_matches(&canonical, excl))
964        {
965            return Classification::Excluded;
966        }
967
968        // 2. Source root or descendant of one.
969        if self
970            .source_roots
971            .iter()
972            .any(|r| path_matches(&canonical, &r.path))
973        {
974            return Classification::Source;
975        }
976
977        // 3. Member folder or descendant of one.
978        for member in &self.member_folders {
979            if path_matches(&canonical, &member.path) {
980                return Classification::Member {
981                    reason: member.reason,
982                };
983            }
984        }
985
986        // 4. Outside the logical workspace entirely.
987        Classification::Unknown
988    }
989}
990
991// ---------------------------------------------------------------------------
992// Helpers
993// ---------------------------------------------------------------------------
994
/// Internal classifier used while building from a `.code-workspace`.
///
/// Each collected folder path is tagged with one of these kinds; when the
/// workspace is materialized, the kind decides which of the workspace's
/// lists the canonical path is pushed onto.
#[derive(Debug, Clone, Copy)]
enum FolderClassKind {
    /// The path becomes a [`SourceRoot`].
    Source,
    /// The path becomes a [`MemberFolder`] carrying the given reason.
    Member(MemberReason),
    /// The path is appended to the workspace's `exclusions`.
    Excluded,
}
1002
/// `true` if `path` is `prefix` itself or lies underneath it.
///
/// The comparison is made on whole path components, so `/a/bc` does not
/// match the prefix `/a/b`. `Path::starts_with` already returns `true`
/// when both paths are equal, so no separate equality check is needed.
fn path_matches(path: &Path, prefix: &Path) -> bool {
    path.starts_with(prefix)
}
1007
1008/// Canonicalize a path and report whether the filesystem could resolve it.
1009///
1010/// The actual canonicalization is delegated to
1011/// [`crate::project::path_utils::canonicalize_path`] — the project-wide
1012/// source of truth which already handles the `realpath(3)` / lexical
1013/// fallback split. The `symlink_unresolved` flag is derived from a
1014/// separate `std::fs::canonicalize(path).is_ok()` probe purely so the
1015/// caller can record in the identity inputs whether the canonical path
1016/// came from the live filesystem or from the lexical fallback.
1017fn canonicalize_with_flag(path: &Path) -> Result<(PathBuf, bool), LogicalWorkspaceError> {
1018    // Probe whether realpath(3) would have succeeded. We deliberately
1019    // do NOT use the resulting path — the canonical path itself is
1020    // produced by `path_utils::canonicalize_path` so the entire
1021    // workspace stack uses one source-of-truth canonicalizer.
1022    let real_canon_succeeded = fs::canonicalize(path).is_ok();
1023
1024    let canonical =
1025        canonicalize_path(path).map_err(|source| LogicalWorkspaceError::Canonicalization {
1026            path: path.to_path_buf(),
1027            source,
1028        })?;
1029
1030    Ok((canonical, !real_canon_succeeded))
1031}
1032
1033/// Apply best-effort case-insensitive normalization. On case-sensitive
1034/// mounts this is a no-op. On case-insensitive mounts we lowercase the
1035/// path so case-variant inputs collapse to the same `WorkspaceId`.
1036fn maybe_lowercase(path: &Path) -> PathBuf {
1037    if is_case_insensitive_mount(path) {
1038        let s = path.to_string_lossy().to_lowercase();
1039        PathBuf::from(s)
1040    } else {
1041        path.to_path_buf()
1042    }
1043}
1044
1045/// Best-effort detection of whether `path` lives on a case-insensitive
1046/// mount. We avoid platform-specific `statvfs` plumbing here; the
1047/// detection is conservative.
1048///
1049/// - If `path` exists and a lowercase variant is present and
1050///   round-trips to the same canonical path, the mount is treated as
1051///   case-insensitive.
1052/// - On Linux the kernel default is case-sensitive; the round-trip
1053///   check therefore returns `false` for almost all paths.
1054/// - On macOS HFS+/APFS (default case-insensitive) and Windows
1055///   NTFS/ReFS the round-trip succeeds and we lowercase.
1056///
1057/// The algorithm never panics and never blocks on slow IO — it does at
1058/// most two `metadata()` calls.
1059fn is_case_insensitive_mount(path: &Path) -> bool {
1060    // Find a path component we can mutate. If the path string contains
1061    // no ASCII alphabetic characters there is nothing to vary, so
1062    // assume case-sensitive.
1063    let s = path.to_string_lossy();
1064    if !s.chars().any(|c| c.is_ascii_alphabetic()) {
1065        return false;
1066    }
1067    // Cheap fast path: try the lowercased and uppercased variants and
1068    // see whether both resolve to the same metadata as the original.
1069    let Ok(orig) = fs::metadata(path) else {
1070        return false;
1071    };
1072    let lower = PathBuf::from(s.to_lowercase());
1073    let upper = PathBuf::from(s.to_uppercase());
1074
1075    let lower_ok = fs::metadata(&lower)
1076        .ok()
1077        .filter(|m| same_inode(m, &orig))
1078        .is_some();
1079    let upper_ok = fs::metadata(&upper)
1080        .ok()
1081        .filter(|m| same_inode(m, &orig))
1082        .is_some();
1083
1084    // We require *both* round-trips to succeed (and at least one of them
1085    // to actually be a different string than the original — otherwise
1086    // the test is trivially true even on case-sensitive FS where
1087    // `path == s.to_lowercase()` already).
1088    let varies = lower != path || upper != path;
1089    varies && lower_ok && upper_ok
1090}
1091
/// Unix: two metadata records describe the same file when both their
/// device id and their inode number agree.
#[cfg(unix)]
fn same_inode(a: &fs::Metadata, b: &fs::Metadata) -> bool {
    use std::os::unix::fs::MetadataExt;
    (a.dev(), a.ino()) == (b.dev(), b.ino())
}
1097
/// Non-Unix fallback: there is no inode to compare here, so two metadata
/// records are treated as the same file when their length and their
/// modification time agree (two unavailable modification times also
/// count as equal).
///
/// This is a heuristic and can report unrelated files as identical. The
/// consequence for the caller is a lowercase pass on a mount that may in
/// fact be case-sensitive.
/// NOTE(review): the original rationale is that this is harmless for
/// identity stability — confirm for mixed-case paths.
#[cfg(not(unix))]
fn same_inode(a: &fs::Metadata, b: &fs::Metadata) -> bool {
    if a.len() != b.len() {
        return false;
    }
    a.modified().ok() == b.modified().ok()
}
1107
1108/// Parse a `sqry.workspace.<key>` string array into a set of absolute
1109/// paths anchored at `base_dir`.
1110fn path_set_from_value(
1111    sqry_top: Option<&serde_json::Value>,
1112    key: &str,
1113    base_dir: &Path,
1114) -> std::collections::BTreeSet<PathBuf> {
1115    let mut set = std::collections::BTreeSet::new();
1116    let Some(top) = sqry_top else { return set };
1117    let Some(arr) = top.get(key).and_then(|v| v.as_array()) else {
1118        return set;
1119    };
1120    for item in arr {
1121        if let Some(s) = item.as_str() {
1122            let p = if Path::new(s).is_absolute() {
1123                PathBuf::from(s)
1124            } else {
1125                base_dir.join(s)
1126            };
1127            set.insert(p);
1128        }
1129    }
1130    set
1131}
1132
1133/// Parse `sqry.workspace.memberFolders`: either a `["path", ...]` array
1134/// (defaults to `OperationalFolder`) or an array of objects
1135/// `{ "path": "...", "reason": "operational" }`.
1136fn member_overrides_from_value(
1137    sqry_top: Option<&serde_json::Value>,
1138    base_dir: &Path,
1139) -> Result<BTreeMap<PathBuf, MemberReason>, LogicalWorkspaceError> {
1140    let mut map = BTreeMap::new();
1141    let Some(top) = sqry_top else { return Ok(map) };
1142    let Some(arr) = top.get("memberFolders").and_then(|v| v.as_array()) else {
1143        return Ok(map);
1144    };
1145    for (idx, item) in arr.iter().enumerate() {
1146        let (path_str, reason) = if let Some(s) = item.as_str() {
1147            (s.to_string(), MemberReason::OperationalFolder)
1148        } else if let Some(obj) = item.as_object() {
1149            let path = obj
1150                .get("path")
1151                .and_then(|v| v.as_str())
1152                .ok_or_else(|| LogicalWorkspaceError::MalformedFolderEntry {
1153                    reason: format!(
1154                        "sqry.workspace.memberFolders[{idx}] object missing string `path`"
1155                    ),
1156                })?
1157                .to_string();
1158            // The "operational" and `_` arms intentionally share a body:
1159            // explicit "operational" is documented; unknown strings fall
1160            // back to the same default. Keep the arms separated so the
1161            // explicit-keyword behaviour is visible in code review.
1162            #[allow(clippy::match_same_arms)]
1163            let reason = obj.get("reason").and_then(|v| v.as_str()).map_or(
1164                MemberReason::OperationalFolder,
1165                |s| match s {
1166                    "operational" => MemberReason::OperationalFolder,
1167                    "non-source" | "nonSource" | "non_source" => MemberReason::NonSourceFolder,
1168                    "noLanguagePluginMatch" | "no-language-plugin-match" => {
1169                        MemberReason::NoLanguagePluginMatch
1170                    }
1171                    _ => MemberReason::OperationalFolder,
1172                },
1173            );
1174            (path, reason)
1175        } else {
1176            return Err(LogicalWorkspaceError::MalformedFolderEntry {
1177                reason: format!(
1178                    "sqry.workspace.memberFolders[{idx}] is neither a string nor an object"
1179                ),
1180            });
1181        };
1182        let abs = if Path::new(&path_str).is_absolute() {
1183            PathBuf::from(&path_str)
1184        } else {
1185            base_dir.join(&path_str)
1186        };
1187        map.insert(abs, reason);
1188    }
1189    Ok(map)
1190}