1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
//! File node operations for CodeGraph
//!
//! Handles file node CRUD operations and in-memory file indexing.
//!
//! # Thread Safety
//!
//! **This module is NOT thread-safe.**
//!
//! `FileOps` is designed for single-threaded use only:
//! - All methods require `&mut self` (exclusive access)
//! - `file_index: HashMap` has no synchronization primitives
//! - No `Send` or `Sync` impls
//!
//! # Usage Pattern
//!
//! `FileOps` is accessed exclusively through `CodeGraph`, which
//! enforces single-threaded access. The parent `CodeGraph` instance
//! must not be shared across threads.
//!
//! For concurrent file operations, use external synchronization
//! (e.g., mutex wrapper around CodeGraph).
use anyhow::Result;
use sqlitegraph::{GraphBackend, NodeId, NodeSpec, SnapshotId};
use std::collections::HashMap;
use std::hash::Hasher;
use std::path::{Path, PathBuf};
use std::sync::Arc;
use std::time::{SystemTime, UNIX_EPOCH};
use xxhash_rust::xxh64::Xxh64;
use crate::graph::schema::FileNode;
use crate::ingest::{SymbolFact, SymbolKind};
/// File operations for CodeGraph
pub struct FileOps {
    // Graph storage backend. Arc allows sharing the handle with the parent
    // CodeGraph, but FileOps itself is single-threaded (see module docs).
    pub backend: Arc<dyn GraphBackend>,
    // In-memory map: normalized absolute path -> File node id.
    // Holds at most one entry per path; rebuilt by rebuild_file_index().
    pub file_index: HashMap<String, NodeId>,
}
/// Normalize a path to absolute form for consistent indexing
///
/// This ensures paths stored in file_index match between:
/// - find_or_create_file_node() (during indexing)
/// - rebuild_file_index() (during database open)
/// - resolve_query_path() (during queries)
///
/// Note: Does NOT canonicalize (file doesn't need to exist). Just makes relative
/// paths absolute from current directory.
///
/// # Arguments
/// * `path` - The path to normalize (may be relative or absolute)
///
/// # Returns
/// Absolute path string
pub(crate) fn normalize_path_for_index(path: &str) -> String {
    // Already absolute: return unchanged. Path::new borrows the &str,
    // so no PathBuf allocation is needed just to test absoluteness.
    if Path::new(path).is_absolute() {
        return path.to_string();
    }
    // Relative path: make absolute from the current directory. Deliberately
    // no canonicalize() -- the file may not exist yet and canonicalize
    // would fail for such paths.
    match std::env::current_dir() {
        Ok(cwd) => cwd.join(path).to_string_lossy().into_owned(),
        // Fallback (cwd unavailable, e.g. deleted directory): return as-is.
        Err(_) => path.to_string(),
    }
}
impl FileOps {
    /// Get current Unix timestamp in seconds
    ///
    /// Returns 0 if the system clock reports a time before the Unix epoch.
    fn now() -> i64 {
        SystemTime::now()
            .duration_since(UNIX_EPOCH)
            .unwrap_or_default()
            .as_secs() as i64
    }

    /// Get filesystem modification time (Unix seconds) for a file path
    ///
    /// Returns 0 if file doesn't exist or mtime cannot be read
    fn get_file_mtime(path: &str) -> i64 {
        std::fs::metadata(path)
            .and_then(|m| m.modified())
            .and_then(|t| t.duration_since(UNIX_EPOCH).map_err(std::io::Error::other))
            .map(|d| d.as_secs() as i64)
            .unwrap_or(0)
    }

    /// Find file node by path, checking in-memory index
    ///
    /// Note: file_index is populated when CodeGraph opens, so this
    /// should find all existing File nodes. Returns None if not found.
    pub fn find_file_node(&mut self, path: &str) -> Result<Option<NodeId>> {
        // Normalize so lookups match how paths are stored during indexing
        let normalized_path = normalize_path_for_index(path);
        Ok(self.file_index.get(&normalized_path).copied())
    }

    /// Find ALL file nodes matching a path by scanning the database.
    ///
    /// Unlike `find_file_node` which uses the in-memory HashMap (only holds one entry
    /// per path), this scans all entities and returns every File node whose path
    /// matches. Use this when cleaning up duplicates.
    pub fn find_all_file_nodes(&self, path: &str) -> Result<Vec<(NodeId, FileNode)>> {
        let normalized_path = normalize_path_for_index(path);
        let mut results = Vec::new();
        let ids = self.backend.entity_ids()?;
        let snapshot = SnapshotId::current();
        for id in ids {
            // Skip entities that fail to load rather than aborting the scan
            let node = match self.backend.get_node(snapshot, id) {
                Ok(n) => n,
                Err(_) => continue,
            };
            if node.kind == "File" {
                if let Ok(file_node) = serde_json::from_value::<FileNode>(node.data) {
                    // Stored paths may predate normalization; normalize before comparing
                    let stored_path = normalize_path_for_index(&file_node.path);
                    if stored_path == normalized_path {
                        results.push((NodeId::from(id), file_node));
                    }
                }
            }
        }
        Ok(results)
    }

    /// Find existing file node or create new one.
    ///
    /// If multiple file nodes exist with the same path (duplicates from earlier
    /// indexing bugs), all but the first are deleted; the first is reused as the
    /// metadata baseline and then replaced with a refreshed node.
    pub fn find_or_create_file_node(&mut self, path: &str, hash: &str) -> Result<NodeId> {
        let now = Self::now();
        let mtime = Self::get_file_mtime(path);
        // Normalize path to absolute canonical form for consistent indexing
        let normalized_path = normalize_path_for_index(path);
        // Find ALL file nodes with this path (not just the one in file_index)
        let all_existing = self.find_all_file_nodes(&normalized_path)?;
        if !all_existing.is_empty() {
            // BUGFIX: delete only the extra duplicates and keep the first node
            // alive as the metadata baseline. (Previously ALL matching nodes
            // were deleted here, so the get_node() below read a deleted entity.)
            if all_existing.len() > 1 {
                for (dup_id, _) in &all_existing[1..] {
                    // Best-effort: a failed delete leaves a stale duplicate
                    // but must not abort re-indexing
                    let _ = self.backend.delete_entity(dup_id.as_i64());
                }
            }
            // Use the first (or only) existing node's metadata as baseline
            let id = all_existing[0].0;
            let snapshot = SnapshotId::current();
            let node = self.backend.get_node(snapshot, id.as_i64())?;
            // Parse existing FileNode, update hash and timestamps, serialize back.
            // Fall back to a fresh FileNode if the stored payload is malformed.
            let mut file_node: FileNode =
                serde_json::from_value(node.data.clone()).unwrap_or_else(|_| FileNode {
                    path: path.to_string(),
                    hash: hash.to_string(),
                    last_indexed_at: now,
                    last_modified: mtime,
                });
            file_node.hash = hash.to_string();
            file_node.last_indexed_at = now;
            file_node.last_modified = mtime;
            let updated_data = serde_json::to_value(file_node)?;
            // Create new NodeSpec with updated data
            let node_spec = NodeSpec {
                kind: "File".to_string(),
                name: normalized_path.clone(),
                file_path: Some(normalized_path.clone()),
                data: updated_data,
            };
            // Delete old node and insert new one (sqlitegraph doesn't support update)
            self.backend.delete_entity(id.as_i64())?;
            let new_id = self.backend.insert_node(node_spec)?;
            let new_node_id = NodeId::from(new_id);
            // Update index with normalized path
            self.file_index.insert(normalized_path, new_node_id);
            Ok(new_node_id)
        } else {
            // Create new file node with timestamps
            let file_node = FileNode {
                path: normalized_path.clone(),
                hash: hash.to_string(),
                last_indexed_at: now,
                last_modified: mtime,
            };
            let node_spec = NodeSpec {
                kind: "File".to_string(),
                name: normalized_path.clone(),
                file_path: Some(normalized_path.clone()),
                data: serde_json::to_value(file_node)?,
            };
            let id = self.backend.insert_node(node_spec)?;
            let node_id = NodeId::from(id);
            // Update index with normalized path
            self.file_index.insert(normalized_path, node_id);
            Ok(node_id)
        }
    }

    /// Rebuild in-memory file index by scanning all nodes
    pub fn rebuild_file_index(&mut self) -> Result<()> {
        self.file_index.clear();
        // Get all entity IDs from the backend
        let ids = self.backend.entity_ids()?;
        let snapshot = SnapshotId::current();
        for id in ids {
            // Skip entities that fail to load rather than aborting the rebuild
            let node = match self.backend.get_node(snapshot, id) {
                Ok(n) => n,
                Err(_) => continue,
            };
            if node.kind == "File" {
                if let Ok(file_node) = serde_json::from_value::<FileNode>(node.data) {
                    // Normalize path to match normalize_path_for_index() format
                    let normalized_path = normalize_path_for_index(&file_node.path);
                    self.file_index.insert(normalized_path, NodeId::from(id));
                }
            }
        }
        Ok(())
    }

    /// Compute xxHash64 of file contents as a 16-char lowercase hex string
    pub fn compute_hash(&self, source: &[u8]) -> String {
        // Seed 0 keeps hashes stable across runs and processes
        let mut hasher = Xxh64::new(0);
        hasher.write(source);
        format!("{:016x}", hasher.finish())
    }

    /// Convert a symbol node to SymbolFact
    ///
    /// Returns Ok(None) if the node's payload does not deserialize as a
    /// SymbolNode. Unrecognized kind strings map to SymbolKind::Unknown.
    pub fn symbol_fact_from_node(
        &self,
        node_id: i64,
        file_path: std::path::PathBuf,
    ) -> Result<Option<SymbolFact>> {
        let snapshot = SnapshotId::current();
        let node = self.backend.get_node(snapshot, node_id)?;
        let symbol_node: crate::graph::schema::SymbolNode =
            match serde_json::from_value(node.data) {
                Ok(n) => n,
                Err(_) => return Ok(None),
            };
        let kind = match symbol_node.kind.as_str() {
            "Function" => SymbolKind::Function,
            "Method" => SymbolKind::Method,
            "Class" => SymbolKind::Class,
            "Interface" => SymbolKind::Interface,
            "Enum" => SymbolKind::Enum,
            "Module" => SymbolKind::Module,
            "Union" => SymbolKind::Union,
            "Namespace" => SymbolKind::Namespace,
            "TypeAlias" => SymbolKind::TypeAlias,
            // "Unknown" and any unrecognized kind both normalize to Unknown
            _ => SymbolKind::Unknown,
        };
        // Prefer the stored normalized kind; derive from `kind` otherwise
        let normalized_kind = symbol_node
            .kind_normalized
            .clone()
            .unwrap_or_else(|| kind.normalized_key().to_string());
        Ok(Some(SymbolFact {
            file_path,
            kind,
            kind_normalized: normalized_kind,
            name: symbol_node.name.clone(),
            fqn: symbol_node.fqn,
            canonical_fqn: None,
            display_fqn: None,
            byte_start: symbol_node.byte_start,
            byte_end: symbol_node.byte_end,
            start_line: symbol_node.start_line,
            start_col: symbol_node.start_col,
            end_line: symbol_node.end_line,
            end_col: symbol_node.end_col,
        }))
    }

    /// Get the FileNode for a given file path
    ///
    /// # Arguments
    /// * `path` - File path to query (relative or absolute)
    ///
    /// # Returns
    /// Option<FileNode> with file metadata including timestamps, or None if not found
    pub fn get_file_node(&mut self, path: &str) -> Result<Option<FileNode>> {
        let node_id = match self.find_file_node(path)? {
            Some(id) => id,
            None => return Ok(None),
        };
        let snapshot = SnapshotId::current();
        let entity = self.backend.get_node(snapshot, node_id.as_i64())?;
        let file_node: FileNode = serde_json::from_value(entity.data)?;
        Ok(Some(file_node))
    }

    /// Get all FileNodes from the database
    ///
    /// # Returns
    /// HashMap of file path -> FileNode for all files in the database
    pub fn all_file_nodes(&mut self) -> Result<HashMap<String, FileNode>> {
        self.all_file_nodes_readonly()
    }

    /// Get all FileNodes from the database (read-only, doesn't rebuild index)
    ///
    /// # Returns
    /// HashMap of file path -> FileNode for all files in the database
    pub fn all_file_nodes_readonly(&self) -> Result<HashMap<String, FileNode>> {
        let mut result = HashMap::new();
        let entity_ids = self.backend.entity_ids()?;
        let snapshot = SnapshotId::current();
        for id in entity_ids {
            // Consistency: skip entities that fail to load instead of aborting
            // the scan (matches find_all_file_nodes and rebuild_file_index)
            let entity = match self.backend.get_node(snapshot, id) {
                Ok(e) => e,
                Err(_) => continue,
            };
            if entity.kind == "File" {
                if let Ok(file_node) = serde_json::from_value::<FileNode>(entity.data) {
                    // Keyed by the stored (possibly un-normalized) path
                    result.insert(file_node.path.clone(), file_node);
                }
            }
        }
        Ok(result)
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Hashing identical bytes twice must produce the same 16-hex-char digest.
    #[test]
    fn test_compute_hash_deterministic() {
        let graph = crate::CodeGraph::open(":memory:").unwrap();
        let ops = graph.files;
        let source = b"fn main() { println!(\"hello\"); }";
        let first = ops.compute_hash(source);
        let second = ops.compute_hash(source);
        assert_eq!(first, second, "Hash should be deterministic");
        assert_eq!(first.len(), 16, "xxHash64 produces 16 hex chars");
    }

    /// Distinct inputs should yield distinct digests.
    #[test]
    fn test_compute_hash_different_inputs() {
        let graph = crate::CodeGraph::open(":memory:").unwrap();
        let ops = graph.files;
        let left = ops.compute_hash(b"fn a() {}");
        let right = ops.compute_hash(b"fn b() {}");
        assert_ne!(
            left, right,
            "Different inputs should produce different hashes"
        );
    }
}