sqry_core/workspace/logical.rs
1//! Logical workspace data model.
2//!
3//! A `LogicalWorkspace` is the unit of identity for cross-repo / workspace-aware
4//! indexing. It carries:
5//!
6//! - The `WorkspaceIdentity` from which a stable `WorkspaceId` (BLAKE3-256
7//! digest) is derived.
8//! - A list of `SourceRoot`s — directories that are auto-indexed and queried.
9//! - A list of `MemberFolder`s — directories that are part of the workspace
10//! but **not** auto-indexed (they fall through to the workspace's source
11//! roots when queried).
12//! - An explicit `exclusions` list — paths that are opaque to sqry.
13//! - Workspace-scoped metadata: `ProjectRootMode`, optional
14//! `index_root_override`, and a `config_fingerprint` placeholder
15//! (populated by the plugin-selection / cost-tier pipeline in a later
16//! step).
17//!
18//! Exhaustive design + identity rules: see
19//! `docs/development/workspace-aware-cross-repo/03_IMPLEMENTATION_PLAN.md` §1.
20//!
21//! ## Identity-input canonicalization (deterministic)
22//!
23//! 1. Every path is funneled through
24//! [`crate::project::path_utils::canonicalize_path`], which resolves
25//! symlinks via `realpath(3)` when possible, falling back to lexical
26//! absolutization when the target does not exist.
27//! 2. When canonicalization falls back (i.e. the path could not be resolved
28//! against the filesystem), the surrounding identity records the fact
29//! via a `symlink_unresolved: bool`. The flag is part of the hash input,
30//! so a missing-then-existing path will produce a different
31//! `WorkspaceId` (this is correct — the shape of the workspace changed).
32//! 3. On case-insensitive mounts (best-effort detected per path) paths are
33//! lowercased before hashing, so case variants resolve to the same
34//! `WorkspaceId`. On case-sensitive filesystems (default on Linux) the
35//! detection returns `false` and paths are hashed verbatim.
36//! 4. `AnonymousMultiRoot.folders` is sorted lexically before hashing so a
37//! reorder of workspace folders is identity-preserving.
38//! 5. `config_fingerprint` is **not** included in the hash — it is a
39//! separate cache dimension.
40//!
41//! ## Member vs Excluded — the contract
42//!
43//! - `Source` paths and their descendants are owned by the source root.
44//! - `Member` paths are part of the logical workspace but not auto-indexed.
45//! Reads still resolve via the workspace's source roots; status returns
46//! the *aggregate* workspace status.
47//! - `Excluded` paths are opaque — searches return empty with an explicit
48//! `excluded` flag.
49//! - `Unknown` paths sit outside the workspace entirely.
50
51use std::collections::BTreeMap;
52use std::fs;
53use std::io;
54use std::path::{Path, PathBuf};
55
56use blake3::Hasher;
57use serde::{Deserialize, Serialize};
58use thiserror::Error;
59
60use crate::project::path_utils::canonicalize_path;
61use crate::project::types::ProjectRootMode;
62
63use super::registry::WorkspaceRegistry;
64
65// ---------------------------------------------------------------------------
66// Errors
67// ---------------------------------------------------------------------------
68
69/// Errors produced while constructing a [`LogicalWorkspace`].
70#[derive(Debug, Error)]
71pub enum LogicalWorkspaceError {
72 /// Generic IO failure that is not tied to a specific path.
73 #[error("io error: {0}")]
74 Io(#[from] io::Error),
75
76 /// A path could not be canonicalized.
77 #[error("failed to canonicalize {path}: {source}")]
78 Canonicalization {
79 /// The path that failed to canonicalize.
80 path: PathBuf,
81 /// Underlying IO error.
82 source: io::Error,
83 },
84
85 /// The legacy `.sqry-workspace` (registry v1) JSON could not be parsed.
86 #[error("failed to parse .sqry-workspace registry: {0}")]
87 ParseSqryWorkspace(serde_json::Error),
88
89 /// The `.code-workspace` JSON could not be parsed.
90 #[error("failed to parse .code-workspace file: {0}")]
91 ParseCodeWorkspace(serde_json::Error),
92
93 /// A `folders[i]` entry in a `.code-workspace` is malformed.
94 #[error("malformed .code-workspace folder entry: {reason}")]
95 MalformedFolderEntry {
96 /// Human-readable reason describing the malformed entry.
97 reason: String,
98 },
99
100 /// The same path was classified into two conflicting roles.
101 #[error("conflicting classification for {path}: {kinds}")]
102 ConflictingClassification {
103 /// The path with conflicting classifications.
104 path: PathBuf,
105 /// A description of the conflicting kinds.
106 kinds: String,
107 },
108}
109
110// ---------------------------------------------------------------------------
111// WorkspaceId — BLAKE3-256 typed digest
112// ---------------------------------------------------------------------------
113
114/// Stable identity for a [`LogicalWorkspace`].
115///
116/// 32 bytes (BLAKE3-256) over the canonicalized identity inputs.
117/// Never truncated to 64 bits — the full 256-bit space is used to keep the
118/// collision probability astronomically small across processes / caches.
119#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
120pub struct WorkspaceId([u8; 32]);
121
122impl WorkspaceId {
123 /// Compute a `WorkspaceId` from the canonical identity inputs of a
124 /// [`WorkspaceIdentity`]. The hashing scheme is documented in
125 /// `03_IMPLEMENTATION_PLAN.md` §1 and tested via the round-trip
126 /// stability tests in `workspace::tests`.
127 #[must_use]
128 pub fn from_identity(identity: &WorkspaceIdentity) -> Self {
129 let mut hasher = Hasher::new();
130 identity.write_hash_input(&mut hasher);
131 Self(*hasher.finalize().as_bytes())
132 }
133
134 /// Borrow the raw 32-byte digest.
135 #[must_use]
136 pub fn as_bytes(&self) -> &[u8; 32] {
137 &self.0
138 }
139
140 /// First 16 hex characters of the digest. Suitable for log lines and
141 /// short file names; **not** sufficient for cross-process identity.
142 #[must_use]
143 pub fn as_short_hex(&self) -> String {
144 let full = self.as_full_hex();
145 full[..16].to_string()
146 }
147
148 /// Full 64-character hex digest. Use this for any identity comparison.
149 #[must_use]
150 pub fn as_full_hex(&self) -> String {
151 use std::fmt::Write as _;
152 let mut s = String::with_capacity(64);
153 for byte in &self.0 {
154 // `write!` to a `String` is infallible.
155 let _ = write!(s, "{byte:02x}");
156 }
157 s
158 }
159}
160
161impl std::fmt::Display for WorkspaceId {
162 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
163 f.write_str(&self.as_short_hex())
164 }
165}
166
167// ---------------------------------------------------------------------------
168// WorkspaceIdentity
169// ---------------------------------------------------------------------------
170
171/// The identity inputs of a logical workspace. `WorkspaceId` is computed
172/// deterministically from these inputs.
173#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
174#[serde(tag = "kind", rename_all = "camelCase")]
175pub enum WorkspaceIdentity {
176 /// Identity derived from a `.sqry-workspace` registry file.
177 SqryWorkspaceFile {
178 /// Canonical absolute path to the registry file.
179 path: PathBuf,
180 /// `true` if `path` could not be filesystem-canonicalized
181 /// (lexical fallback was used).
182 symlink_unresolved: bool,
183 },
184 /// Identity derived from a VS Code `.code-workspace` file.
185 VsCodeWorkspaceFile {
186 /// Canonical absolute path to the workspace file.
187 path: PathBuf,
188 /// `true` if `path` could not be filesystem-canonicalized.
189 symlink_unresolved: bool,
190 },
191 /// Identity derived from an ad-hoc multi-folder VS Code workspace.
192 AnonymousMultiRoot {
193 /// Folder roots, **sorted lexically** before hashing for stability.
194 folders: Vec<PathBuf>,
195 /// `true` if any of the folders could not be canonicalized.
196 symlink_unresolved: bool,
197 },
198 /// Identity derived from a single root path (e.g. `sqry index <path>`).
199 SingleRoot {
200 /// Canonical absolute root path.
201 path: PathBuf,
202 /// `true` if `path` could not be canonicalized.
203 symlink_unresolved: bool,
204 },
205}
206
207impl WorkspaceIdentity {
208 /// Tag byte used in the BLAKE3 hash input. Stable; do not renumber.
209 fn tag_byte(&self) -> u8 {
210 match self {
211 Self::SqryWorkspaceFile { .. } => 0,
212 Self::VsCodeWorkspaceFile { .. } => 1,
213 Self::AnonymousMultiRoot { .. } => 2,
214 Self::SingleRoot { .. } => 3,
215 }
216 }
217
218 /// `symlink_unresolved` flag — recorded in the hash input.
219 fn symlink_unresolved(&self) -> bool {
220 match self {
221 Self::SqryWorkspaceFile {
222 symlink_unresolved, ..
223 }
224 | Self::VsCodeWorkspaceFile {
225 symlink_unresolved, ..
226 }
227 | Self::AnonymousMultiRoot {
228 symlink_unresolved, ..
229 }
230 | Self::SingleRoot {
231 symlink_unresolved, ..
232 } => *symlink_unresolved,
233 }
234 }
235
236 /// Write the deterministic hash input for `WorkspaceId` derivation.
237 fn write_hash_input(&self, hasher: &mut Hasher) {
238 hasher.update(&[self.tag_byte()]);
239 // 0x00 / 0x01 byte for symlink_unresolved.
240 hasher.update(&[u8::from(self.symlink_unresolved())]);
241 match self {
242 Self::SqryWorkspaceFile { path, .. }
243 | Self::VsCodeWorkspaceFile { path, .. }
244 | Self::SingleRoot { path, .. } => {
245 hash_path(hasher, path);
246 }
247 Self::AnonymousMultiRoot { folders, .. } => {
248 let count = u32::try_from(folders.len()).unwrap_or(u32::MAX);
249 hasher.update(&count.to_le_bytes());
250 for folder in folders {
251 hash_path(hasher, folder);
252 }
253 }
254 }
255 }
256}
257
258/// Hash a single canonical path: u32 LE byte length followed by the
259/// path's UTF-8 bytes (lossy if the path is not valid UTF-8 — extremely
260/// unusual on supported targets, but we never panic).
261fn hash_path(hasher: &mut Hasher, path: &Path) {
262 let s = path.to_string_lossy();
263 let bytes = s.as_bytes();
264 let len = u32::try_from(bytes.len()).unwrap_or(u32::MAX);
265 hasher.update(&len.to_le_bytes());
266 hasher.update(bytes);
267}
268
269// ---------------------------------------------------------------------------
270// SourceRoot, MemberFolder, Classification
271// ---------------------------------------------------------------------------
272
273/// A directory that is auto-indexed by sqry. One `.sqry/graph/manifest.json`
274/// per source root.
275#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
276pub struct SourceRoot {
277 /// Canonical absolute path to the source root.
278 pub path: PathBuf,
279 /// Path to the per-source-root index manifest (always
280 /// `<path>/.sqry/graph/manifest.json`).
281 pub index_path: PathBuf,
282 /// Optional list of language hints to bias plugin selection.
283 pub language_hints: Option<Vec<String>>,
284 /// Optional path to the JVM classpath cache directory
285 /// (`<path>/.sqry/classpath/`); populated by the JVM pipeline.
286 pub classpath_dir: Option<PathBuf>,
287 /// Per-source-root override of the workspace-level
288 /// `config_fingerprint`. `0` here; populated by the plugin-selection
289 /// pipeline in a later step.
290 pub config_fingerprint: u64,
291}
292
293impl SourceRoot {
294 /// Build a `SourceRoot` from a canonical path, deriving the standard
295 /// `.sqry/graph/manifest.json` index path. `language_hints` is
296 /// `None`; `classpath_dir` is `None`; `config_fingerprint` is `0`.
297 #[must_use]
298 pub fn from_path(path: PathBuf) -> Self {
299 let index_path = path.join(".sqry").join("graph").join("manifest.json");
300 Self {
301 path,
302 index_path,
303 language_hints: None,
304 classpath_dir: None,
305 config_fingerprint: 0,
306 }
307 }
308
309 /// STEP_11_4 — populate [`Self::classpath_dir`] from a probe of
310 /// `<self.path>/.sqry/classpath/`. The directory is set when the
311 /// probe finds a *directory* at that path, leaving the field
312 /// `None` when the path is missing or is not a directory.
313 ///
314 /// Returns `Ok(())` on a successful (possibly negative) probe.
315 /// Returns the raw [`io::Error`] when the probe fails for a reason
316 /// other than `NotFound` (e.g. permission denied) so callers can
317 /// surface a [`super::cache::WorkspaceWarning::ClasspathProbeFailed`]
318 /// without losing the underlying error detail.
319 ///
320 /// # Errors
321 ///
322 /// Returns the underlying [`io::Error`] when [`fs::metadata`] fails
323 /// for a reason other than `NotFound`.
324 pub fn populate_classpath_dir(&mut self) -> io::Result<()> {
325 let probe = self.path.join(".sqry").join("classpath");
326 match fs::metadata(&probe) {
327 Ok(meta) if meta.is_dir() => {
328 self.classpath_dir = Some(probe);
329 Ok(())
330 }
331 Ok(_) => {
332 // Path exists but is not a directory — treat as
333 // "no classpath present" without raising an error.
334 self.classpath_dir = None;
335 Ok(())
336 }
337 Err(err) if err.kind() == io::ErrorKind::NotFound => {
338 self.classpath_dir = None;
339 Ok(())
340 }
341 Err(err) => Err(err),
342 }
343 }
344
345 /// STEP_11_4 — fluent builder for [`Self::config_fingerprint`].
346 ///
347 /// Used by call sites that hold a freshly computed
348 /// [`crate::config::compute_workspace_config_fingerprint`] value
349 /// alongside the source root. A fingerprint of `0` is the
350 /// "unset" sentinel and the builder accepts it for the same
351 /// reason `WorkspaceKey::config_fingerprint = 0` is the default
352 /// — call sites that want strict identity must supply a non-zero
353 /// value.
354 #[must_use]
355 pub fn with_config_fingerprint(mut self, fingerprint: u64) -> Self {
356 self.config_fingerprint = fingerprint;
357 self
358 }
359
360 /// STEP_11_4 — return the per-source-root config fingerprint with
361 /// fallback to a workspace-level default supplied by the caller.
362 ///
363 /// `SourceRoot.config_fingerprint == 0` is treated as "use the
364 /// workspace-level fingerprint" — the daemon `WorkspaceKey`
365 /// dimension this powers must always carry the workspace's
366 /// fingerprint when no per-source-root override is set so two
367 /// otherwise-identical paths under different workspaces stay in
368 /// distinct cache entries.
369 #[must_use]
370 pub fn effective_config_fingerprint(&self, workspace_default: u64) -> u64 {
371 if self.config_fingerprint == 0 {
372 workspace_default
373 } else {
374 self.config_fingerprint
375 }
376 }
377}
378
379/// Why a folder was classified as a member (rather than a source root or
380/// excluded path).
381#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
382#[serde(rename_all = "camelCase")]
383pub enum MemberReason {
384 /// Folder holds tooling / build / scripts, not first-class source.
385 OperationalFolder,
386 /// Folder exists but contains no plugin-recognized source files.
387 NonSourceFolder,
388 /// Heuristic could not match any registered language plugin
389 /// (last-resort default — see §1.1).
390 NoLanguagePluginMatch,
391}
392
393/// A folder that is part of the logical workspace but is **not**
394/// auto-indexed.
395#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
396pub struct MemberFolder {
397 /// Canonical absolute path to the member folder.
398 pub path: PathBuf,
399 /// Why the folder was classified as a member.
400 pub reason: MemberReason,
401}
402
403/// The result of [`LogicalWorkspace::classify`].
404#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
405#[serde(tag = "kind", rename_all = "camelCase")]
406pub enum Classification {
407 /// Path is a known source root or a descendant of one.
408 Source,
409 /// Path is a known member folder or a descendant of one.
410 Member {
411 /// Why the owning member folder was so classified.
412 reason: MemberReason,
413 },
414 /// Path was explicitly excluded.
415 Excluded,
416 /// Path is outside the logical workspace entirely.
417 Unknown,
418}
419
420/// Verdict returned by an injected heuristic classifier when a folder is
421/// not explicitly classified by the user.
422///
423/// The heuristic policy itself lives outside `sqry-core` (in the LSP /
424/// extension / wrapper); `sqry-core` accepts it as an injected
425/// `&dyn Fn(&Path) -> HeuristicVerdict` so policy stays separate from the
426/// data model.
427#[derive(Debug, Clone, PartialEq, Eq)]
428pub enum HeuristicVerdict {
429 /// Folder should be treated as a source root.
430 Source,
431 /// Folder should be treated as a member (with a specific reason).
432 Member {
433 /// Reason the folder is a member.
434 reason: MemberReason,
435 },
436 /// Folder should be excluded.
437 Excluded,
438 /// Heuristic could not classify; caller decides the last-resort
439 /// default.
440 Unknown,
441}
442
443// ---------------------------------------------------------------------------
444// LogicalWorkspace
445// ---------------------------------------------------------------------------
446
447/// A logical workspace — the unit of identity for cross-repo / workspace
448/// indexing.
449#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
450pub struct LogicalWorkspace {
451 identity: WorkspaceIdentity,
452 workspace_id: WorkspaceId,
453 source_roots: Vec<SourceRoot>,
454 member_folders: Vec<MemberFolder>,
455 exclusions: Vec<PathBuf>,
456 project_root_mode: ProjectRootMode,
457 index_root_override: Option<PathBuf>,
458 config_fingerprint: u64,
459}
460
461impl LogicalWorkspace {
462 /// Construct from a `.sqry-workspace` registry file.
463 ///
464 /// `WorkspaceRegistry::load` accepts both v1 (flat `repositories`
465 /// list) and v2 (`source_roots`, `member_folders`, `exclusions`,
466 /// `project_root_mode`) on-disk shapes — v1 is auto-upgraded to v2
467 /// in memory. This constructor projects every v2 field into the
468 /// resulting [`LogicalWorkspace`]:
469 ///
470 /// * `repositories` → `source_roots` (canonicalized).
471 /// * `member_folders` → `MemberFolder { path, reason }` (canonicalized).
472 /// * `exclusions` → canonical absolute paths.
473 /// * `project_root_mode` → carried verbatim.
474 ///
475 /// `STEP_7` codex iter4 fix — pre-iter4 this constructor dropped
476 /// `member_folders`, `exclusions`, and `project_root_mode` on the
477 /// floor, defeating acceptance criteria 5/6 end-to-end (the redactor
478 /// receives an empty `LogicalWorkspaceView::exclusions` /
479 /// `member_folders`, so `redact_excluded_in_passthrough` and the
480 /// member-folder prefix renderer never fire on real
481 /// `.sqry-workspace`-loaded sessions). The pre-iter4 inline TODO
482 /// pointed at "STEP_2 will overhaul the registry layer entirely" —
483 /// STEP_2 shipped the registry-side v2 schema but did not update this
484 /// projection. Fixed here so STEP_7's MCP redaction wiring is
485 /// observable end-to-end.
486 ///
487 /// # Errors
488 ///
489 /// Returns [`LogicalWorkspaceError`] when the registry file cannot be
490 /// loaded or any path canonicalization fails irrecoverably.
491 pub fn from_sqry_workspace(path: &Path) -> Result<Self, LogicalWorkspaceError> {
492 // Load the registry. v1 files are auto-upgraded to v2 in memory
493 // by `WorkspaceRegistry::load`; we propagate serde errors as a
494 // dedicated variant so callers can distinguish parse failures
495 // from IO.
496 let registry = WorkspaceRegistry::load(path).map_err(|err| match err {
497 super::error::WorkspaceError::Serialization(e) => {
498 LogicalWorkspaceError::ParseSqryWorkspace(e)
499 }
500 super::error::WorkspaceError::Io { source, .. } => LogicalWorkspaceError::Io(source),
501 other => LogicalWorkspaceError::Io(io::Error::other(other.to_string())),
502 })?;
503
504 let (canonical_path, symlink_unresolved) = canonicalize_with_flag(path)?;
505 let identity = WorkspaceIdentity::SqryWorkspaceFile {
506 path: maybe_lowercase(&canonical_path),
507 symlink_unresolved,
508 };
509 let workspace_id = WorkspaceId::from_identity(&identity);
510
511 let mut source_roots = Vec::with_capacity(registry.repositories.len());
512 for repo in ®istry.repositories {
513 let (canonical_repo, _unresolved) = canonicalize_with_flag(&repo.root)?;
514 let mut root = SourceRoot::from_path(canonical_repo);
515 // Preserve the registry-supplied index_path if it points at
516 // a real manifest (registry v1 uses `<repo>/.sqry-index`,
517 // not `.sqry/graph/manifest.json`); leave `from_path`'s
518 // computed manifest path otherwise.
519 root.index_path.clone_from(&repo.index_path);
520 if let Some(lang) = repo.primary_language.clone() {
521 root.language_hints = Some(vec![lang]);
522 }
523 source_roots.push(root);
524 }
525
526 // v2 projection: carry member_folders, exclusions, and
527 // project_root_mode through to the LogicalWorkspace so the MCP
528 // redactor (and any other consumer of `member_folders()` /
529 // `exclusions()`) sees the same structure the registry persists.
530 let mut member_folders = Vec::with_capacity(registry.member_folders.len());
531 for member in ®istry.member_folders {
532 let (canonical_root, _unresolved) = canonicalize_with_flag(&member.root)?;
533 member_folders.push(MemberFolder {
534 path: canonical_root,
535 reason: member.reason,
536 });
537 }
538
539 let mut exclusions = Vec::with_capacity(registry.exclusions.len());
540 for excluded in ®istry.exclusions {
541 let (canonical_excluded, _unresolved) = canonicalize_with_flag(excluded)?;
542 exclusions.push(canonical_excluded);
543 }
544
545 let mut ws = Self {
546 identity,
547 workspace_id,
548 source_roots,
549 member_folders,
550 exclusions,
551 project_root_mode: registry.project_root_mode,
552 index_root_override: None,
553 config_fingerprint: 0,
554 };
555 // STEP_11_4 — auto-populate classpath_dir on every source root.
556 let _failures = ws.populate_classpath_dirs();
557 Ok(ws)
558 }
559
560 /// Construct from a `.code-workspace` JSON file.
561 ///
562 /// The `heuristic_fn` is invoked for every folder that does not carry
563 /// an explicit `sqry.role`, is not in the top-level
564 /// `sqry.workspace.sourceRoots` / `.exclusions` overrides, and is not
565 /// already classified as a member by an explicit
566 /// `sqry.workspace.memberFolders` entry.
567 ///
568 /// # Errors
569 ///
570 /// Returns [`LogicalWorkspaceError`] for IO failures, JSON parse
571 /// errors, malformed folder entries, or path canonicalization
572 /// failures that cannot be recovered via lexical absolutization.
573 #[allow(clippy::too_many_lines)] // single-pass classifier; splitting hurts clarity.
574 pub fn from_code_workspace(
575 workspace_file: &Path,
576 heuristic_fn: &dyn Fn(&Path) -> HeuristicVerdict,
577 ) -> Result<Self, LogicalWorkspaceError> {
578 let bytes = fs::read(workspace_file)?;
579 let json: serde_json::Value =
580 serde_json::from_slice(&bytes).map_err(LogicalWorkspaceError::ParseCodeWorkspace)?;
581
582 let workspace_dir = workspace_file
583 .parent()
584 .map_or_else(|| PathBuf::from("."), Path::to_path_buf);
585
586 // Resolve the canonical workspace-file path for identity.
587 let (canonical_workspace_file, symlink_unresolved) =
588 canonicalize_with_flag(workspace_file)?;
589 let identity = WorkspaceIdentity::VsCodeWorkspaceFile {
590 path: maybe_lowercase(&canonical_workspace_file),
591 symlink_unresolved,
592 };
593 let workspace_id = WorkspaceId::from_identity(&identity);
594
595 // Collect folder entries. Per the .code-workspace spec each
596 // folder has a `path` (required) and optional `name`, plus
597 // sqry-specific `sqry.role`.
598 let folders_v = json.get("folders").cloned().unwrap_or_default();
599 let folders_arr = folders_v.as_array().cloned().unwrap_or_default();
600
601 // Top-level sqry.workspace overrides.
602 let sqry_top = json.get("sqry.workspace");
603 let top_source_roots = path_set_from_value(sqry_top, "sourceRoots", &workspace_dir);
604 let top_exclusions = path_set_from_value(sqry_top, "exclusions", &workspace_dir);
605 let top_members = member_overrides_from_value(sqry_top, &workspace_dir)?;
606 let project_root_mode = sqry_top
607 .and_then(|v| v.get("projectRootMode"))
608 .and_then(|v| v.as_str())
609 .and_then(ProjectRootMode::from_str_opt)
610 .unwrap_or_default();
611
612 // Build per-path classification map. The key is the absolute
613 // path *as configured*; we canonicalize at the end.
614 let mut classified: BTreeMap<PathBuf, FolderClassKind> = BTreeMap::new();
615 let mut all_folders: Vec<PathBuf> = Vec::new();
616
617 for (idx, entry) in folders_arr.iter().enumerate() {
618 let raw_path = entry.get("path").and_then(|v| v.as_str()).ok_or_else(|| {
619 LogicalWorkspaceError::MalformedFolderEntry {
620 reason: format!("folders[{idx}] missing string `path`"),
621 }
622 })?;
623 let abs = if Path::new(raw_path).is_absolute() {
624 PathBuf::from(raw_path)
625 } else {
626 workspace_dir.join(raw_path)
627 };
628 all_folders.push(abs.clone());
629
630 // Step 4: explicit per-folder `sqry.role` always wins.
631 if let Some(role) = entry.get("sqry.role").and_then(|v| v.as_str()) {
632 let kind = match role {
633 "source" => FolderClassKind::Source,
634 "operational" => FolderClassKind::Member(MemberReason::OperationalFolder),
635 "non-source" | "nonSource" | "non_source" => {
636 FolderClassKind::Member(MemberReason::NonSourceFolder)
637 }
638 "excluded" => FolderClassKind::Excluded,
639 other => {
640 return Err(LogicalWorkspaceError::MalformedFolderEntry {
641 reason: format!(
642 "folders[{idx}].sqry.role = '{other}' (expected source|operational|excluded|non-source)"
643 ),
644 });
645 }
646 };
647 classified.insert(abs, kind);
648 continue;
649 }
650
651 // Step 5: top-level sqry.workspace overrides.
652 if top_exclusions.contains(&abs) {
653 classified.insert(abs, FolderClassKind::Excluded);
654 continue;
655 }
656 if top_source_roots.contains(&abs) {
657 classified.insert(abs, FolderClassKind::Source);
658 continue;
659 }
660 if let Some(reason) = top_members.get(&abs).copied() {
661 classified.insert(abs, FolderClassKind::Member(reason));
662 continue;
663 }
664
665 // Step 6: heuristic fallback.
666 let verdict = heuristic_fn(&abs);
667 let kind = match verdict {
668 HeuristicVerdict::Source => FolderClassKind::Source,
669 HeuristicVerdict::Member { reason } => FolderClassKind::Member(reason),
670 HeuristicVerdict::Excluded => FolderClassKind::Excluded,
671 HeuristicVerdict::Unknown => {
672 // Step 7: last-resort default for unclassified folders.
673 FolderClassKind::Member(MemberReason::NoLanguagePluginMatch)
674 }
675 };
676 classified.insert(abs, kind);
677 }
678
679 // Top-level overrides may reference paths that were not present in
680 // the `folders[]` array. Honor them too.
681 for path in &top_source_roots {
682 classified
683 .entry(path.clone())
684 .or_insert(FolderClassKind::Source);
685 }
686 for path in &top_exclusions {
687 classified
688 .entry(path.clone())
689 .or_insert(FolderClassKind::Excluded);
690 }
691 for (path, reason) in &top_members {
692 classified
693 .entry(path.clone())
694 .or_insert(FolderClassKind::Member(*reason));
695 }
696
697 // Materialize.
698 let mut source_roots = Vec::new();
699 let mut member_folders = Vec::new();
700 let mut exclusions = Vec::new();
701 for (raw_path, kind) in classified {
702 let (canonical, _unresolved) = canonicalize_with_flag(&raw_path)?;
703 let canonical = maybe_lowercase(&canonical);
704 match kind {
705 FolderClassKind::Source => source_roots.push(SourceRoot::from_path(canonical)),
706 FolderClassKind::Member(reason) => member_folders.push(MemberFolder {
707 path: canonical,
708 reason,
709 }),
710 FolderClassKind::Excluded => exclusions.push(canonical),
711 }
712 }
713
714 let mut ws = Self {
715 identity,
716 workspace_id,
717 source_roots,
718 member_folders,
719 exclusions,
720 project_root_mode,
721 index_root_override: None,
722 config_fingerprint: 0,
723 };
724 let _failures = ws.populate_classpath_dirs();
725 Ok(ws)
726 }
727
728 /// Construct an ad-hoc multi-root workspace (every folder is a source
729 /// root). Folders are sorted lexically before hashing so identity is
730 /// stable under reorder.
731 ///
732 /// # Errors
733 ///
734 /// Returns [`LogicalWorkspaceError`] if any folder cannot be
735 /// canonicalized irrecoverably.
736 #[allow(clippy::needless_pass_by_value)] // owning constructor.
737 pub fn anonymous_multi_root(folders: Vec<PathBuf>) -> Result<Self, LogicalWorkspaceError> {
738 let mut canonical_folders = Vec::with_capacity(folders.len());
739 let mut symlink_unresolved = false;
740 for folder in &folders {
741 let (canon, unresolved) = canonicalize_with_flag(folder)?;
742 symlink_unresolved |= unresolved;
743 canonical_folders.push(maybe_lowercase(&canon));
744 }
745 canonical_folders.sort();
746 let identity = WorkspaceIdentity::AnonymousMultiRoot {
747 folders: canonical_folders.clone(),
748 symlink_unresolved,
749 };
750 let workspace_id = WorkspaceId::from_identity(&identity);
751
752 let source_roots = canonical_folders
753 .iter()
754 .cloned()
755 .map(SourceRoot::from_path)
756 .collect();
757
758 let mut ws = Self {
759 identity,
760 workspace_id,
761 source_roots,
762 member_folders: Vec::new(),
763 exclusions: Vec::new(),
764 project_root_mode: ProjectRootMode::default(),
765 index_root_override: None,
766 config_fingerprint: 0,
767 };
768 let _failures = ws.populate_classpath_dirs();
769 Ok(ws)
770 }
771
772 /// Construct a single-root workspace (one source root, no members).
773 ///
774 /// # Errors
775 ///
776 /// Returns [`LogicalWorkspaceError`] if `path` cannot be canonicalized
777 /// irrecoverably.
778 #[allow(clippy::needless_pass_by_value)] // owning constructor.
779 pub fn single_root(path: PathBuf) -> Result<Self, LogicalWorkspaceError> {
780 let (canonical, symlink_unresolved) = canonicalize_with_flag(&path)?;
781 let canonical = maybe_lowercase(&canonical);
782 let identity = WorkspaceIdentity::SingleRoot {
783 path: canonical.clone(),
784 symlink_unresolved,
785 };
786 let workspace_id = WorkspaceId::from_identity(&identity);
787 let mut ws = Self {
788 identity,
789 workspace_id,
790 source_roots: vec![SourceRoot::from_path(canonical)],
791 member_folders: Vec::new(),
792 exclusions: Vec::new(),
793 project_root_mode: ProjectRootMode::default(),
794 index_root_override: None,
795 config_fingerprint: 0,
796 };
797 let _failures = ws.populate_classpath_dirs();
798 Ok(ws)
799 }
800
/// Test-only seam: build a single-root workspace with the
/// case-sensitivity decision *forced* to `case_insensitive` instead of
/// being detected from the live mount. The
/// `case_insensitive_mount_produces_same_id_end_to_end` test uses it to
/// exercise acceptance criterion 4 deterministically on case-sensitive
/// Linux hosts, where the live detector would return `false` and skip
/// the lowercase path.
///
/// Canonicalization matches [`Self::single_root`]; only the
/// case-folding step differs, taking the explicit `case_insensitive`
/// argument rather than consulting `is_case_insensitive_mount`. No
/// classpath probe is run.
#[cfg(test)]
#[allow(clippy::needless_pass_by_value)]
pub(crate) fn single_root_with_case_sensitivity(
    path: PathBuf,
    case_insensitive: bool,
) -> Result<Self, LogicalWorkspaceError> {
    let (resolved, symlink_unresolved) = canonicalize_with_flag(&path)?;
    let root_path = match case_insensitive {
        true => PathBuf::from(resolved.to_string_lossy().to_lowercase()),
        false => resolved,
    };
    let source_roots = vec![SourceRoot::from_path(root_path.clone())];
    let identity = WorkspaceIdentity::SingleRoot {
        path: root_path,
        symlink_unresolved,
    };
    Ok(Self {
        workspace_id: WorkspaceId::from_identity(&identity),
        identity,
        source_roots,
        member_folders: Vec::new(),
        exclusions: Vec::new(),
        project_root_mode: ProjectRootMode::default(),
        index_root_override: None,
        config_fingerprint: 0,
    })
}
841
842 // -- Accessors --
843
844 /// The stable BLAKE3-256 identity of this workspace.
845 #[must_use]
846 pub fn workspace_id(&self) -> &WorkspaceId {
847 &self.workspace_id
848 }
849
850 /// The identity inputs that produced [`Self::workspace_id`].
851 #[must_use]
852 pub fn identity(&self) -> &WorkspaceIdentity {
853 &self.identity
854 }
855
856 /// The auto-indexed source roots.
857 #[must_use]
858 pub fn source_roots(&self) -> &[SourceRoot] {
859 &self.source_roots
860 }
861
862 /// The non-indexed member folders.
863 #[must_use]
864 pub fn member_folders(&self) -> &[MemberFolder] {
865 &self.member_folders
866 }
867
868 /// Explicitly excluded paths.
869 #[must_use]
870 pub fn exclusions(&self) -> &[PathBuf] {
871 &self.exclusions
872 }
873
874 /// The workspace-level [`ProjectRootMode`].
875 #[must_use]
876 pub fn project_root_mode(&self) -> ProjectRootMode {
877 self.project_root_mode
878 }
879
880 /// Optional `--index-root` override.
881 #[must_use]
882 pub fn index_root_override(&self) -> Option<&Path> {
883 self.index_root_override.as_deref()
884 }
885
886 /// Workspace-level config fingerprint. Populated by the
887 /// plugin-selection / cost-tier pipeline via
888 /// [`Self::set_config_fingerprint`] and consumed by
889 /// `sqry-daemon::WorkspaceKey` so two source roots sharing path
890 /// but differing fingerprint stay in distinct cache entries.
891 #[must_use]
892 pub fn config_fingerprint(&self) -> u64 {
893 self.config_fingerprint
894 }
895
896 /// STEP_11_4 — set the workspace-level config fingerprint computed
897 /// via [`crate::config::compute_workspace_config_fingerprint`].
898 ///
899 /// The fingerprint is **not** part of the [`WorkspaceId`] hash
900 /// input — it is a separate cache dimension consumed by the
901 /// daemon's `WorkspaceKey`. Two `LogicalWorkspace`s with the
902 /// same identity but different fingerprints share an identity but
903 /// produce distinct daemon cache entries.
904 pub fn set_config_fingerprint(&mut self, fingerprint: u64) {
905 self.config_fingerprint = fingerprint;
906 }
907
908 /// STEP_11_4 — set the workspace-level config fingerprint and
909 /// propagate it to every [`SourceRoot`] that does not already
910 /// carry an explicit per-root override (i.e. whose
911 /// `config_fingerprint == 0`).
912 ///
913 /// This is the typical wiring point: callers compute one
914 /// workspace-level fingerprint, then call
915 /// `set_config_fingerprint_with_inheritance` so source roots
916 /// without an explicit override inherit the workspace value.
917 /// Source roots that carry a non-zero override are left
918 /// untouched.
919 pub fn set_config_fingerprint_with_inheritance(&mut self, fingerprint: u64) {
920 self.config_fingerprint = fingerprint;
921 for root in &mut self.source_roots {
922 if root.config_fingerprint == 0 {
923 root.config_fingerprint = fingerprint;
924 }
925 }
926 }
927
928 /// STEP_11_4 — populate every [`SourceRoot::classpath_dir`] in this
929 /// workspace by probing `<root>/.sqry/classpath/` for each. Returns
930 /// a vector of `(source_root, io::Error)` pairs for any probe that
931 /// failed for a reason other than `NotFound`; callers typically
932 /// fold these into [`super::cache::WorkspaceWarning::ClasspathProbeFailed`].
933 pub fn populate_classpath_dirs(&mut self) -> Vec<(PathBuf, io::Error)> {
934 let mut failures = Vec::new();
935 for root in &mut self.source_roots {
936 if let Err(err) = root.populate_classpath_dir() {
937 failures.push((root.path.clone(), err));
938 }
939 }
940 failures
941 }
942
943 /// Returns `true` if `path` matches one of the registered source
944 /// roots exactly (not a descendant).
945 #[must_use]
946 pub fn is_source_root(&self, path: &Path) -> bool {
947 let canonical =
948 canonicalize_path(path).map_or_else(|_| path.to_path_buf(), |p| maybe_lowercase(&p));
949 self.source_roots.iter().any(|r| r.path == canonical)
950 }
951
952 /// Classify a path against the workspace per §1.4 of the
953 /// implementation plan.
954 #[must_use]
955 pub fn classify(&self, path: &Path) -> Classification {
956 let canonical =
957 canonicalize_path(path).map_or_else(|_| path.to_path_buf(), |p| maybe_lowercase(&p));
958
959 // 1. Exclusion match (exact or descendant).
960 if self
961 .exclusions
962 .iter()
963 .any(|excl| path_matches(&canonical, excl))
964 {
965 return Classification::Excluded;
966 }
967
968 // 2. Source root or descendant of one.
969 if self
970 .source_roots
971 .iter()
972 .any(|r| path_matches(&canonical, &r.path))
973 {
974 return Classification::Source;
975 }
976
977 // 3. Member folder or descendant of one.
978 for member in &self.member_folders {
979 if path_matches(&canonical, &member.path) {
980 return Classification::Member {
981 reason: member.reason,
982 };
983 }
984 }
985
986 // 4. Outside the logical workspace entirely.
987 Classification::Unknown
988 }
989}
990
991// ---------------------------------------------------------------------------
992// Helpers
993// ---------------------------------------------------------------------------
994
995/// Internal classifier used while building from a `.code-workspace`.
996#[derive(Debug, Clone, Copy)]
997enum FolderClassKind {
998 Source,
999 Member(MemberReason),
1000 Excluded,
1001}
1002
/// Component-wise containment test: `true` when `path` is `prefix`
/// itself or lies somewhere beneath it.
///
/// The comparison works on whole path components, so `/a/bc` does not
/// match the prefix `/a/b`.
fn path_matches(path: &Path, prefix: &Path) -> bool {
    // `Path::starts_with` already accepts the equal-path case, so no
    // separate equality check is needed.
    path.starts_with(prefix)
}
1007
1008/// Canonicalize a path and report whether the filesystem could resolve it.
1009///
1010/// The actual canonicalization is delegated to
1011/// [`crate::project::path_utils::canonicalize_path`] — the project-wide
1012/// source of truth which already handles the `realpath(3)` / lexical
1013/// fallback split. The `symlink_unresolved` flag is derived from a
1014/// separate `std::fs::canonicalize(path).is_ok()` probe purely so the
1015/// caller can record in the identity inputs whether the canonical path
1016/// came from the live filesystem or from the lexical fallback.
1017fn canonicalize_with_flag(path: &Path) -> Result<(PathBuf, bool), LogicalWorkspaceError> {
1018 // Probe whether realpath(3) would have succeeded. We deliberately
1019 // do NOT use the resulting path — the canonical path itself is
1020 // produced by `path_utils::canonicalize_path` so the entire
1021 // workspace stack uses one source-of-truth canonicalizer.
1022 let real_canon_succeeded = fs::canonicalize(path).is_ok();
1023
1024 let canonical =
1025 canonicalize_path(path).map_err(|source| LogicalWorkspaceError::Canonicalization {
1026 path: path.to_path_buf(),
1027 source,
1028 })?;
1029
1030 Ok((canonical, !real_canon_succeeded))
1031}
1032
1033/// Apply best-effort case-insensitive normalization. On case-sensitive
1034/// mounts this is a no-op. On case-insensitive mounts we lowercase the
1035/// path so case-variant inputs collapse to the same `WorkspaceId`.
1036fn maybe_lowercase(path: &Path) -> PathBuf {
1037 if is_case_insensitive_mount(path) {
1038 let s = path.to_string_lossy().to_lowercase();
1039 PathBuf::from(s)
1040 } else {
1041 path.to_path_buf()
1042 }
1043}
1044
1045/// Best-effort detection of whether `path` lives on a case-insensitive
1046/// mount. We avoid platform-specific `statvfs` plumbing here; the
1047/// detection is conservative.
1048///
1049/// - If `path` exists and a lowercase variant is present and
1050/// round-trips to the same canonical path, the mount is treated as
1051/// case-insensitive.
1052/// - On Linux the kernel default is case-sensitive; the round-trip
1053/// check therefore returns `false` for almost all paths.
1054/// - On macOS HFS+/APFS (default case-insensitive) and Windows
1055/// NTFS/ReFS the round-trip succeeds and we lowercase.
1056///
1057/// The algorithm never panics and never blocks on slow IO — it does at
1058/// most two `metadata()` calls.
1059fn is_case_insensitive_mount(path: &Path) -> bool {
1060 // Find a path component we can mutate. If the path string contains
1061 // no ASCII alphabetic characters there is nothing to vary, so
1062 // assume case-sensitive.
1063 let s = path.to_string_lossy();
1064 if !s.chars().any(|c| c.is_ascii_alphabetic()) {
1065 return false;
1066 }
1067 // Cheap fast path: try the lowercased and uppercased variants and
1068 // see whether both resolve to the same metadata as the original.
1069 let Ok(orig) = fs::metadata(path) else {
1070 return false;
1071 };
1072 let lower = PathBuf::from(s.to_lowercase());
1073 let upper = PathBuf::from(s.to_uppercase());
1074
1075 let lower_ok = fs::metadata(&lower)
1076 .ok()
1077 .filter(|m| same_inode(m, &orig))
1078 .is_some();
1079 let upper_ok = fs::metadata(&upper)
1080 .ok()
1081 .filter(|m| same_inode(m, &orig))
1082 .is_some();
1083
1084 // We require *both* round-trips to succeed (and at least one of them
1085 // to actually be a different string than the original — otherwise
1086 // the test is trivially true even on case-sensitive FS where
1087 // `path == s.to_lowercase()` already).
1088 let varies = lower != path || upper != path;
1089 varies && lower_ok && upper_ok
1090}
1091
/// Unix implementation: two metadata records describe the same file
/// when both their inode number and their device id are equal.
#[cfg(unix)]
fn same_inode(a: &fs::Metadata, b: &fs::Metadata) -> bool {
    use std::os::unix::fs::MetadataExt;
    (a.ino(), a.dev()) == (b.ino(), b.dev())
}
1097
/// Non-Unix fallback: no inode/device pair is consulted here, so two
/// metadata records are treated as "the same file" when their length
/// and their modification time (or the absence of one) are equal.
///
/// This is a best-effort heuristic and can report a false positive.
/// The original rationale is that a false positive merely triggers the
/// lowercase pass on a case-sensitive mount, which was judged harmless
/// for identity stability.
#[cfg(not(unix))]
fn same_inode(a: &fs::Metadata, b: &fs::Metadata) -> bool {
    if a.len() != b.len() {
        return false;
    }
    // `modified()` may be unsupported; `.ok()` turns that into `None`,
    // and two `None`s compare equal.
    a.modified().ok() == b.modified().ok()
}
1107
1108/// Parse a `sqry.workspace.<key>` string array into a set of absolute
1109/// paths anchored at `base_dir`.
1110fn path_set_from_value(
1111 sqry_top: Option<&serde_json::Value>,
1112 key: &str,
1113 base_dir: &Path,
1114) -> std::collections::BTreeSet<PathBuf> {
1115 let mut set = std::collections::BTreeSet::new();
1116 let Some(top) = sqry_top else { return set };
1117 let Some(arr) = top.get(key).and_then(|v| v.as_array()) else {
1118 return set;
1119 };
1120 for item in arr {
1121 if let Some(s) = item.as_str() {
1122 let p = if Path::new(s).is_absolute() {
1123 PathBuf::from(s)
1124 } else {
1125 base_dir.join(s)
1126 };
1127 set.insert(p);
1128 }
1129 }
1130 set
1131}
1132
1133/// Parse `sqry.workspace.memberFolders`: either a `["path", ...]` array
1134/// (defaults to `OperationalFolder`) or an array of objects
1135/// `{ "path": "...", "reason": "operational" }`.
1136fn member_overrides_from_value(
1137 sqry_top: Option<&serde_json::Value>,
1138 base_dir: &Path,
1139) -> Result<BTreeMap<PathBuf, MemberReason>, LogicalWorkspaceError> {
1140 let mut map = BTreeMap::new();
1141 let Some(top) = sqry_top else { return Ok(map) };
1142 let Some(arr) = top.get("memberFolders").and_then(|v| v.as_array()) else {
1143 return Ok(map);
1144 };
1145 for (idx, item) in arr.iter().enumerate() {
1146 let (path_str, reason) = if let Some(s) = item.as_str() {
1147 (s.to_string(), MemberReason::OperationalFolder)
1148 } else if let Some(obj) = item.as_object() {
1149 let path = obj
1150 .get("path")
1151 .and_then(|v| v.as_str())
1152 .ok_or_else(|| LogicalWorkspaceError::MalformedFolderEntry {
1153 reason: format!(
1154 "sqry.workspace.memberFolders[{idx}] object missing string `path`"
1155 ),
1156 })?
1157 .to_string();
1158 // The "operational" and `_` arms intentionally share a body:
1159 // explicit "operational" is documented; unknown strings fall
1160 // back to the same default. Keep the arms separated so the
1161 // explicit-keyword behaviour is visible in code review.
1162 #[allow(clippy::match_same_arms)]
1163 let reason = obj.get("reason").and_then(|v| v.as_str()).map_or(
1164 MemberReason::OperationalFolder,
1165 |s| match s {
1166 "operational" => MemberReason::OperationalFolder,
1167 "non-source" | "nonSource" | "non_source" => MemberReason::NonSourceFolder,
1168 "noLanguagePluginMatch" | "no-language-plugin-match" => {
1169 MemberReason::NoLanguagePluginMatch
1170 }
1171 _ => MemberReason::OperationalFolder,
1172 },
1173 );
1174 (path, reason)
1175 } else {
1176 return Err(LogicalWorkspaceError::MalformedFolderEntry {
1177 reason: format!(
1178 "sqry.workspace.memberFolders[{idx}] is neither a string nor an object"
1179 ),
1180 });
1181 };
1182 let abs = if Path::new(&path_str).is_absolute() {
1183 PathBuf::from(&path_str)
1184 } else {
1185 base_dir.join(&path_str)
1186 };
1187 map.insert(abs, reason);
1188 }
1189 Ok(map)
1190}