1use anyhow::{anyhow, Context, Result};
2use regex::Regex;
3use serde::{Deserialize, Serialize};
4use serde_json::json;
5use sha2::{Digest, Sha256};
6use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet};
7use std::fs;
8use std::path::{Path, PathBuf};
9use std::str::FromStr;
10use tree_sitter::{Language, Node, Parser};
11use ucm_core::{
12 normalize::{canonical_json, normalize_content},
13 Block, BlockId, Content, Document, DocumentId, DocumentMetadata, Edge, EdgeType,
14};
15use ucp_llm::IdMapper;
16
/// Profile name recorded in `metadata.custom.profile` on generated documents.
pub const CODEGRAPH_PROFILE: &str = "codegraph";
/// Profile version recorded in `metadata.custom.profile_version`.
pub const CODEGRAPH_PROFILE_VERSION: &str = "v1";
/// Combined marker (`profile.version`) recorded in `metadata.custom.profile_marker`
/// and reported as `CodeGraphBuildResult::profile_version`.
pub const CODEGRAPH_PROFILE_MARKER: &str = "codegraph.v1";
/// Extractor version stamp recorded in `metadata.custom.extractor_version`.
pub const CODEGRAPH_EXTRACTOR_VERSION: &str = "ucp-codegraph-extractor.v1";

// Keys used in per-block `metadata.custom` maps.
const META_NODE_CLASS: &str = "node_class";
const META_LOGICAL_KEY: &str = "logical_key";
const META_PATH: &str = "path";
const META_LANGUAGE: &str = "language";
const META_SYMBOL_KIND: &str = "symbol_kind";
const META_SYMBOL_NAME: &str = "name";
const META_SPAN: &str = "span";
const META_EXPORTED: &str = "exported";
30
/// Severity attached to a [`CodeGraphDiagnostic`]; only `Error` affects
/// validation outcomes, `Warning` downgrades a build to partial success.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum CodeGraphSeverity {
    Error,
    Warning,
    Info,
}
38
/// A single issue reported during extraction or validation, identified by a
/// stable `code` (e.g. "CG2003") and optionally tied to a file path and/or a
/// block logical key.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct CodeGraphDiagnostic {
    pub severity: CodeGraphSeverity,
    /// Stable diagnostic code: CG1xxx = profile validation, CG2xxx = extraction.
    pub code: String,
    pub message: String,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub path: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub logical_key: Option<String>,
}
49
50impl CodeGraphDiagnostic {
51 fn error(code: &str, message: impl Into<String>) -> Self {
52 Self {
53 severity: CodeGraphSeverity::Error,
54 code: code.to_string(),
55 message: message.into(),
56 path: None,
57 logical_key: None,
58 }
59 }
60
61 fn warning(code: &str, message: impl Into<String>) -> Self {
62 Self {
63 severity: CodeGraphSeverity::Warning,
64 code: code.to_string(),
65 message: message.into(),
66 path: None,
67 logical_key: None,
68 }
69 }
70
71 fn info(code: &str, message: impl Into<String>) -> Self {
72 Self {
73 severity: CodeGraphSeverity::Info,
74 code: code.to_string(),
75 message: message.into(),
76 path: None,
77 logical_key: None,
78 }
79 }
80
81 fn with_path(mut self, path: impl Into<String>) -> Self {
82 self.path = Some(path.into());
83 self
84 }
85
86 fn with_logical_key(mut self, logical_key: impl Into<String>) -> Self {
87 self.logical_key = Some(logical_key.into());
88 self
89 }
90}
91
/// Result of [`validate_code_graph_profile`]: `valid` is true exactly when no
/// error-severity diagnostic was produced.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Default)]
pub struct CodeGraphValidationResult {
    pub valid: bool,
    pub diagnostics: Vec<CodeGraphDiagnostic>,
}
97
/// Aggregate node/edge counts for a built code graph (computed by
/// `compute_stats`, defined elsewhere in this module).
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Default)]
pub struct CodeGraphStats {
    pub total_nodes: usize,
    pub repository_nodes: usize,
    pub directory_nodes: usize,
    pub file_nodes: usize,
    pub symbol_nodes: usize,
    pub total_edges: usize,
    pub reference_edges: usize,
    pub export_edges: usize,
    // Per-language counts keyed by language tag; omitted from JSON when empty.
    #[serde(default, skip_serializing_if = "BTreeMap::is_empty")]
    pub languages: BTreeMap<String, usize>,
}
111
/// Overall outcome of [`build_code_graph`]:
/// * `Success` — no diagnostics beyond info-severity,
/// * `PartialSuccess` — warnings or non-profile errors occurred,
/// * `FailedValidation` — a CG100x profile-validation error was raised.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum CodeGraphBuildStatus {
    Success,
    PartialSuccess,
    FailedValidation,
}
119
/// Everything produced by a [`build_code_graph`] run: the document itself,
/// accumulated diagnostics, stats, the profile marker, the canonical
/// SHA-256 fingerprint, and the derived status.
#[derive(Debug, Clone)]
pub struct CodeGraphBuildResult {
    pub document: Document,
    pub diagnostics: Vec<CodeGraphDiagnostic>,
    pub stats: CodeGraphStats,
    /// Set to [`CODEGRAPH_PROFILE_MARKER`].
    pub profile_version: String,
    /// Hex SHA-256 of the canonical JSON projection (see `canonical_fingerprint`).
    pub canonical_fingerprint: String,
    pub status: CodeGraphBuildStatus,
}
129
130impl CodeGraphBuildResult {
131 pub fn has_errors(&self) -> bool {
132 self.diagnostics
133 .iter()
134 .any(|d| d.severity == CodeGraphSeverity::Error)
135 }
136}
137
/// Inputs to [`build_code_graph`]: the repository to scan, the commit hash
/// used for document identity, and optional extractor configuration
/// (defaulted when absent from deserialized input).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CodeGraphBuildInput {
    pub repository_path: PathBuf,
    pub commit_hash: String,
    #[serde(default)]
    pub config: CodeGraphExtractorConfig,
}
145
/// Tunables for repository scanning and extraction. Every field has a serde
/// default so a missing/empty config object is valid.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CodeGraphExtractorConfig {
    /// File extensions to include (presumably consumed by
    /// `collect_repository_files` — defined elsewhere in this module).
    #[serde(default = "default_include_extensions")]
    pub include_extensions: Vec<String>,
    /// Directory names to skip during traversal.
    #[serde(default = "default_exclude_dirs")]
    pub exclude_dirs: Vec<String>,
    /// When true, unreadable files are recorded as diagnostics instead of
    /// aborting the whole build.
    #[serde(default = "default_continue_on_parse_error")]
    pub continue_on_parse_error: bool,
    #[serde(default)]
    pub include_hidden: bool,
    /// Files larger than this (in bytes) are skipped with a CG2008 warning.
    #[serde(default = "default_max_file_bytes")]
    pub max_file_bytes: usize,
    /// When true, exported symbols get a file -> symbol "exports" edge.
    #[serde(default = "default_emit_export_edges")]
    pub emit_export_edges: bool,
}
161
162impl Default for CodeGraphExtractorConfig {
163 fn default() -> Self {
164 Self {
165 include_extensions: default_include_extensions(),
166 exclude_dirs: default_exclude_dirs(),
167 continue_on_parse_error: default_continue_on_parse_error(),
168 include_hidden: false,
169 max_file_bytes: default_max_file_bytes(),
170 emit_export_edges: default_emit_export_edges(),
171 }
172 }
173}
174
/// Source-file extensions scanned when the config does not override them.
fn default_include_extensions() -> Vec<String> {
    ["rs", "py", "ts", "tsx", "js", "jsx"]
        .iter()
        .map(|ext| (*ext).to_string())
        .collect()
}
181
/// Directory names skipped during traversal when the config does not
/// override them (VCS internals and common build-output directories).
fn default_exclude_dirs() -> Vec<String> {
    [".git", "target", "node_modules", "dist", "build"]
        .map(String::from)
        .to_vec()
}
188
/// By default, extraction keeps going past unreadable/unparsable files.
fn default_continue_on_parse_error() -> bool {
    true
}
192
/// Default per-file size limit: 2 MiB.
fn default_max_file_bytes() -> usize {
    2 << 20
}
196
/// By default, exported symbols get file -> symbol "exports" edges.
fn default_emit_export_edges() -> bool {
    true
}
200
/// A serialization-friendly view of a `Document` with all block ids rendered
/// as strings and `BTreeMap`s for deterministic ordering. Round-trips through
/// `from_document` / `to_document`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PortableDocument {
    pub id: String,
    pub root: String,
    pub structure: BTreeMap<String, Vec<String>>,
    pub blocks: BTreeMap<String, Block>,
    pub metadata: DocumentMetadata,
    /// Copied from `DocumentVersion::counter`.
    pub version: u64,
}
210
211impl PortableDocument {
212 pub fn from_document(doc: &Document) -> Self {
213 let mut structure = BTreeMap::new();
214 for (parent, children) in &doc.structure {
215 let mut sorted = children.clone();
216 sorted.sort_by_key(|id| id.to_string());
217 structure.insert(
218 parent.to_string(),
219 sorted.into_iter().map(|id| id.to_string()).collect(),
220 );
221 }
222
223 let mut blocks = BTreeMap::new();
224 for (id, block) in &doc.blocks {
225 blocks.insert(id.to_string(), block.clone());
226 }
227
228 Self {
229 id: doc.id.0.clone(),
230 root: doc.root.to_string(),
231 structure,
232 blocks,
233 metadata: doc.metadata.clone(),
234 version: doc.version.counter,
235 }
236 }
237
238 pub fn to_document(&self) -> Result<Document> {
239 let root = BlockId::from_str(&self.root)
240 .map_err(|_| anyhow!("invalid root block id: {}", self.root))?;
241
242 let mut structure: HashMap<BlockId, Vec<BlockId>> = HashMap::new();
243 for (parent, children) in &self.structure {
244 let parent_id = BlockId::from_str(parent)
245 .map_err(|_| anyhow!("invalid structure parent id: {}", parent))?;
246 let mut parsed_children = Vec::with_capacity(children.len());
247 for child in children {
248 let child_id = BlockId::from_str(child)
249 .map_err(|_| anyhow!("invalid structure child id: {}", child))?;
250 parsed_children.push(child_id);
251 }
252 structure.insert(parent_id, parsed_children);
253 }
254
255 let mut blocks: HashMap<BlockId, Block> = HashMap::new();
256 for (id, block) in &self.blocks {
257 let block_id = BlockId::from_str(id)
258 .map_err(|_| anyhow!("invalid block id in blocks map: {}", id))?;
259 blocks.insert(block_id, block.clone());
260 }
261
262 let mut doc = Document {
263 id: DocumentId::new(self.id.clone()),
264 root,
265 structure,
266 blocks,
267 metadata: self.metadata.clone(),
268 indices: Default::default(),
269 edge_index: Default::default(),
270 version: ucm_core::DocumentVersion {
271 counter: self.version,
272 timestamp: deterministic_timestamp(),
273 state_hash: [0u8; 8],
274 },
275 };
276 doc.rebuild_indices();
277 Ok(doc)
278 }
279}
280
/// Builds a CodeGraphProfile v1 `Document` describing the repository in
/// `input`, returning the document plus diagnostics, stats, a canonical
/// fingerprint, and an overall status.
///
/// Phases:
/// 1. resolve the repository root and enumerate candidate source files,
/// 2. create repository / directory / file blocks mirroring the tree,
/// 3. parse each file and attach symbol blocks (plus optional export edges),
/// 4. resolve imports into file -> file reference edges,
/// 5. canonicalize ordering and timestamps, validate, and fingerprint.
///
/// Hard errors are returned only for an unusable repository path or for an
/// unreadable file when `continue_on_parse_error` is off; everything else is
/// recorded as a diagnostic.
pub fn build_code_graph(input: &CodeGraphBuildInput) -> Result<CodeGraphBuildResult> {
    let repo_root = input
        .repository_path
        .canonicalize()
        .with_context(|| format!("failed to resolve repo path {:?}", input.repository_path))?;

    if !repo_root.is_dir() {
        return Err(anyhow!(
            "repository path is not a directory: {}",
            repo_root.display()
        ));
    }

    let mut diagnostics = Vec::new();
    let matcher = GitignoreMatcher::from_repository(&repo_root)?;
    let files = collect_repository_files(&repo_root, &input.config, &matcher, &mut diagnostics)?;

    let repo_name = repo_root
        .file_name()
        .map(|s| s.to_string_lossy().to_string())
        .unwrap_or_else(|| "repository".to_string());

    // Document id derives from repo name + commit so rebuilds of the same
    // commit yield the same id.
    let mut doc = Document::new(DocumentId::new(format!(
        "codegraph:{}:{}",
        sanitize_identifier(&repo_name),
        sanitize_identifier(&input.commit_hash)
    )));

    initialize_document_metadata(&mut doc, &repo_root, &repo_name, &input.commit_hash);

    let repo_block = make_repository_block(&repo_name, &input.commit_hash);
    let root_id = doc.root;
    let repo_block_id = doc.add_block(repo_block, &root_id)?;

    // Materialize every ancestor directory exactly once; BTreeSet ordering
    // guarantees parents are created before their children below.
    let mut directories = BTreeSet::new();
    for file in &files {
        for dir in ancestor_directories(&file.relative_path) {
            directories.insert(dir);
        }
    }

    let mut directory_ids: BTreeMap<String, BlockId> = BTreeMap::new();
    for dir in directories {
        let parent_id = parent_directory_id(&dir, &directory_ids).unwrap_or(repo_block_id);
        let block = make_directory_block(&dir);
        let block_id = doc.add_block(block, &parent_id)?;
        directory_ids.insert(dir, block_id);
    }

    let mut file_ids: BTreeMap<String, BlockId> = BTreeMap::new();
    let mut file_analyses = Vec::new();
    // Tracks symbol logical keys already issued, so duplicates can be
    // disambiguated by unique_symbol_logical_key.
    let mut used_symbol_keys: HashSet<String> = HashSet::new();

    for file in files {
        let parent_id = parent_id_for_file(&file.relative_path, repo_block_id, &directory_ids);

        let source = match fs::read_to_string(&file.absolute_path) {
            Ok(s) => s,
            Err(err) => {
                // Unreadable file: record CG2003, then either skip it or
                // abort the whole build depending on configuration.
                let diag = CodeGraphDiagnostic::error(
                    "CG2003",
                    format!("failed to read source file: {}", err),
                )
                .with_path(file.relative_path.clone());
                diagnostics.push(diag);
                if input.config.continue_on_parse_error {
                    continue;
                }
                return Err(anyhow!(
                    "failed to read source file {}: {}",
                    file.relative_path,
                    err
                ));
            }
        };

        // Oversized files are skipped entirely (no file block) with CG2008.
        if source.len() > input.config.max_file_bytes {
            diagnostics.push(
                CodeGraphDiagnostic::warning(
                    "CG2008",
                    format!(
                        "file skipped due to size limit ({} bytes > {} bytes)",
                        source.len(),
                        input.config.max_file_bytes
                    ),
                )
                .with_path(file.relative_path.clone()),
            );
            continue;
        }

        let file_block = make_file_block(&file.relative_path, file.language.as_str());
        let file_block_id = doc.add_block(file_block, &parent_id)?;
        file_ids.insert(file.relative_path.clone(), file_block_id);

        let analysis = analyze_file(&file.relative_path, &source, file.language);
        // Re-tag each analysis diagnostic with this file's path.
        for diag in &analysis.diagnostics {
            diagnostics.push(diag.clone().with_path(file.relative_path.clone()));
        }

        for symbol in &analysis.symbols {
            let logical_key = unique_symbol_logical_key(
                &file.relative_path,
                &symbol.name,
                symbol.start_line,
                &mut used_symbol_keys,
            );
            let symbol_block = make_symbol_block(
                &logical_key,
                &file.relative_path,
                file.language.as_str(),
                symbol,
            );
            let symbol_id = doc.add_block(symbol_block, &file_block_id)?;

            // Optional file -> symbol "exports" edge for exported symbols.
            if symbol.exported && input.config.emit_export_edges {
                let mut edge = Edge::new(EdgeType::Custom("exports".to_string()), symbol_id);
                edge.metadata
                    .custom
                    .insert("relation".to_string(), json!("exports"));
                edge.metadata
                    .custom
                    .insert("symbol".to_string(), json!(symbol.name.clone()));
                if let Some(source_block) = doc.get_block_mut(&file_block_id) {
                    source_block.edges.push(edge);
                }
            }
        }

        file_analyses.push(FileAnalysisRecord {
            file: file.relative_path,
            language: file.language,
            imports: analysis.imports,
        });
    }

    // Resolve imports against the set of known files. The BTreeSet both
    // de-duplicates edges and fixes a deterministic insertion order.
    let known_files: BTreeSet<String> = file_ids.keys().cloned().collect();
    let mut pending_reference_edges: BTreeSet<(String, String, String)> = BTreeSet::new();

    for record in &file_analyses {
        for import in &record.imports {
            match resolve_import(&record.file, &record.language, &import.module, &known_files) {
                Some(target) if target != record.file => {
                    pending_reference_edges.insert((
                        record.file.clone(),
                        target,
                        import.module.clone(),
                    ));
                }
                // Self-imports are deliberately ignored.
                Some(_) => {}
                None => {
                    diagnostics.push(
                        CodeGraphDiagnostic::warning(
                            "CG2006",
                            format!("unresolved import '{}'", import.module),
                        )
                        .with_path(record.file.clone()),
                    );
                }
            }
        }
    }

    for (source_path, target_path, raw_import) in pending_reference_edges {
        let (Some(source_id), Some(target_id)) =
            (file_ids.get(&source_path), file_ids.get(&target_path))
        else {
            continue;
        };
        let mut edge = Edge::new(EdgeType::References, *target_id);
        edge.metadata
            .custom
            .insert("relation".to_string(), json!("imports"));
        edge.metadata
            .custom
            .insert("raw_import".to_string(), json!(raw_import));
        if let Some(source_block) = doc.get_block_mut(source_id) {
            source_block.edges.push(edge);
        }
    }

    // Canonicalize child ordering, edge ordering, and timestamps, then
    // rebuild derived indices before validating/fingerprinting.
    sort_structure_children_by_logical_key(&mut doc);
    sort_edges(&mut doc);
    normalize_temporal_fields(&mut doc);
    doc.rebuild_indices();

    let mut validation = validate_code_graph_profile(&doc);
    diagnostics.append(&mut validation.diagnostics);

    let fingerprint = canonical_fingerprint(&doc)?;
    let stats = compute_stats(&doc);

    // CG100x errors are profile violations -> FailedValidation; any other
    // non-info diagnostic downgrades the build to PartialSuccess.
    let has_profile_errors = diagnostics
        .iter()
        .any(|d| d.severity == CodeGraphSeverity::Error && d.code.starts_with("CG100"));
    let has_non_info = diagnostics
        .iter()
        .any(|d| d.severity != CodeGraphSeverity::Info);

    let status = if has_profile_errors {
        CodeGraphBuildStatus::FailedValidation
    } else if has_non_info {
        CodeGraphBuildStatus::PartialSuccess
    } else {
        CodeGraphBuildStatus::Success
    };

    Ok(CodeGraphBuildResult {
        document: doc,
        diagnostics,
        stats,
        profile_version: CODEGRAPH_PROFILE_MARKER.to_string(),
        canonical_fingerprint: fingerprint,
        status,
    })
}
497
/// Validates `doc` against CodeGraphProfile v1 invariants.
///
/// Checks (diagnostic code in parentheses):
/// * document-level profile (CG1001) and profile_version (CG1002) markers,
/// * every non-root block has a node_class (CG1010) and logical_key (CG1011),
///   plus class-specific required metadata (delegated to
///   `validate_required_metadata`),
/// * at least one node of each expected class exists (CG1012, warning only),
/// * logical keys are globally unique (CG1013),
/// * edges point at existing blocks (CG1014) and obey class constraints:
///   `references` is file -> file (CG1015), custom `exports` is
///   file -> symbol (CG1016).
///
/// `valid` is true exactly when no error-severity diagnostic was produced.
pub fn validate_code_graph_profile(doc: &Document) -> CodeGraphValidationResult {
    let mut diagnostics = Vec::new();

    // Document-level profile markers.
    match doc.metadata.custom.get("profile").and_then(|v| v.as_str()) {
        Some(CODEGRAPH_PROFILE) => {}
        Some(other) => diagnostics.push(CodeGraphDiagnostic::error(
            "CG1001",
            format!(
                "invalid profile marker '{}', expected '{}'",
                other, CODEGRAPH_PROFILE
            ),
        )),
        None => diagnostics.push(CodeGraphDiagnostic::error(
            "CG1001",
            "missing document metadata.custom.profile marker",
        )),
    }

    match doc
        .metadata
        .custom
        .get("profile_version")
        .and_then(|v| v.as_str())
    {
        Some(CODEGRAPH_PROFILE_VERSION) => {}
        Some(other) => diagnostics.push(CodeGraphDiagnostic::error(
            "CG1002",
            format!(
                "invalid profile version '{}', expected '{}'",
                other, CODEGRAPH_PROFILE_VERSION
            ),
        )),
        None => diagnostics.push(CodeGraphDiagnostic::error(
            "CG1002",
            "missing document metadata.custom.profile_version marker",
        )),
    }

    // Per-block checks; logical keys and class counts are collected along the way.
    let mut logical_keys: HashMap<String, Vec<BlockId>> = HashMap::new();
    let mut class_counts: HashMap<String, usize> = HashMap::new();

    for (id, block) in &doc.blocks {
        // The synthetic root block is exempt from profile requirements.
        if *id == doc.root {
            continue;
        }

        let class = node_class(block);
        let Some(class_name) = class else {
            diagnostics.push(
                CodeGraphDiagnostic::error(
                    "CG1010",
                    "block missing node_class metadata (or custom semantic role)",
                )
                .with_path(block_path(block).unwrap_or_else(|| id.to_string())),
            );
            continue;
        };

        *class_counts.entry(class_name.clone()).or_default() += 1;

        match block_logical_key(block) {
            Some(logical_key) => {
                logical_keys.entry(logical_key).or_default().push(*id);
            }
            None => diagnostics.push(
                CodeGraphDiagnostic::error("CG1011", "missing required logical_key metadata")
                    .with_path(block_path(block).unwrap_or_else(|| id.to_string())),
            ),
        }

        validate_required_metadata(&class_name, block, &mut diagnostics);
    }

    // A completely absent node class is suspicious but not fatal.
    for class in ["repository", "directory", "file", "symbol"] {
        if class_counts.get(class).copied().unwrap_or(0) == 0 {
            diagnostics.push(CodeGraphDiagnostic::warning(
                "CG1012",
                format!("profile has no '{}' nodes", class),
            ));
        }
    }

    // Logical keys must be document-unique.
    for (logical_key, ids) in logical_keys {
        if ids.len() > 1 {
            diagnostics.push(
                CodeGraphDiagnostic::error(
                    "CG1013",
                    format!(
                        "logical_key '{}' is duplicated by {} blocks",
                        logical_key,
                        ids.len()
                    ),
                )
                .with_logical_key(logical_key),
            );
        }
    }

    // Edge checks; the id -> logical-key index makes diagnostics readable.
    let logical_by_id = logical_key_index(doc);

    for (source_id, block) in &doc.blocks {
        let Some(source_class) = node_class(block) else {
            continue;
        };
        for edge in &block.edges {
            let target_block = match doc.get_block(&edge.target) {
                Some(b) => b,
                None => {
                    diagnostics.push(
                        CodeGraphDiagnostic::error(
                            "CG1014",
                            format!("edge references missing target block {}", edge.target),
                        )
                        .with_logical_key(
                            logical_by_id
                                .get(source_id)
                                .cloned()
                                .unwrap_or_else(|| source_id.to_string()),
                        ),
                    );
                    continue;
                }
            };

            let target_class = node_class(target_block).unwrap_or_default();

            match &edge.edge_type {
                EdgeType::References => {
                    if source_class != "file" || target_class != "file" {
                        diagnostics.push(
                            CodeGraphDiagnostic::error(
                                "CG1015",
                                "references edges must connect file -> file",
                            )
                            .with_logical_key(
                                logical_by_id
                                    .get(source_id)
                                    .cloned()
                                    .unwrap_or_else(|| source_id.to_string()),
                            ),
                        );
                    }
                }
                EdgeType::Custom(name) if name == "exports" => {
                    if source_class != "file" || target_class != "symbol" {
                        diagnostics.push(
                            CodeGraphDiagnostic::error(
                                "CG1016",
                                "exports edges must connect file -> symbol",
                            )
                            .with_logical_key(
                                logical_by_id
                                    .get(source_id)
                                    .cloned()
                                    .unwrap_or_else(|| source_id.to_string()),
                            ),
                        );
                    }
                }
                // Other edge types are not constrained by this profile.
                _ => {}
            }
        }
    }

    CodeGraphValidationResult {
        valid: diagnostics
            .iter()
            .all(|d| d.severity != CodeGraphSeverity::Error),
        diagnostics,
    }
}
669
/// Serializes `doc` into a deterministic canonical JSON string keyed by block
/// logical keys instead of raw block ids.
///
/// Nodes, structure entries, and edges are each emitted as explicitly sorted
/// arrays so that structurally identical graphs produce byte-identical
/// output; the result is the input to [`canonical_fingerprint`].
pub fn canonical_codegraph_json(doc: &Document) -> Result<String> {
    let logical_by_id = logical_key_index(doc);

    // Nodes: one entry per non-root block, sorted by logical key.
    let mut node_entries = Vec::new();
    for (id, block) in &doc.blocks {
        if *id == doc.root {
            continue;
        }

        // Fall back to the raw id when a block has no logical key.
        let logical_key = logical_by_id
            .get(id)
            .cloned()
            .unwrap_or_else(|| id.to_string());

        let class = node_class(block).unwrap_or_else(|| "unknown".to_string());
        let metadata = normalized_block_metadata(block);

        node_entries.push(json!({
            "logical_key": logical_key,
            "node_class": class,
            "semantic_role": block.metadata.semantic_role.as_ref().map(|r| r.to_string()),
            "content_type": block.content.type_tag(),
            "content": normalize_content(&block.content),
            "metadata": metadata,
        }));
    }

    node_entries.sort_by(|a, b| {
        let ak = a
            .get("logical_key")
            .and_then(|v| v.as_str())
            .unwrap_or_default();
        let bk = b
            .get("logical_key")
            .and_then(|v| v.as_str())
            .unwrap_or_default();
        ak.cmp(bk)
    });

    // Structure: parent logical key -> sorted child logical keys,
    // entries sorted by parent.
    let mut structure_entries = Vec::new();
    for (parent, children) in &doc.structure {
        let parent_key = logical_by_id
            .get(parent)
            .cloned()
            .unwrap_or_else(|| parent.to_string());

        let mut child_keys: Vec<String> = children
            .iter()
            .map(|child| {
                logical_by_id
                    .get(child)
                    .cloned()
                    .unwrap_or_else(|| child.to_string())
            })
            .collect();
        child_keys.sort();

        structure_entries.push(json!({
            "parent": parent_key,
            "children": child_keys,
        }));
    }

    structure_entries.sort_by(|a, b| {
        let ak = a.get("parent").and_then(|v| v.as_str()).unwrap_or_default();
        let bk = b.get("parent").and_then(|v| v.as_str()).unwrap_or_default();
        ak.cmp(bk)
    });

    // Edges: sorted by (source, edge_type, target).
    let mut edge_entries = Vec::new();
    for (source_id, block) in &doc.blocks {
        let source_key = logical_by_id
            .get(source_id)
            .cloned()
            .unwrap_or_else(|| source_id.to_string());

        for edge in &block.edges {
            let target_key = logical_by_id
                .get(&edge.target)
                .cloned()
                .unwrap_or_else(|| edge.target.to_string());
            edge_entries.push(json!({
                "source": source_key,
                "edge_type": edge.edge_type.as_str(),
                "target": target_key,
                "metadata": normalized_edge_metadata(edge),
            }));
        }
    }

    edge_entries.sort_by(|a, b| {
        let a_source = a.get("source").and_then(|v| v.as_str()).unwrap_or_default();
        let b_source = b.get("source").and_then(|v| v.as_str()).unwrap_or_default();
        a_source
            .cmp(b_source)
            .then_with(|| {
                a.get("edge_type")
                    .and_then(|v| v.as_str())
                    .unwrap_or_default()
                    .cmp(
                        b.get("edge_type")
                            .and_then(|v| v.as_str())
                            .unwrap_or_default(),
                    )
            })
            .then_with(|| {
                a.get("target")
                    .and_then(|v| v.as_str())
                    .unwrap_or_default()
                    .cmp(b.get("target").and_then(|v| v.as_str()).unwrap_or_default())
            })
    });

    let canonical = json!({
        "profile": CODEGRAPH_PROFILE,
        "profile_version": CODEGRAPH_PROFILE_VERSION,
        "nodes": node_entries,
        "structure": structure_entries,
        "edges": edge_entries,
        "document_metadata": normalized_document_metadata(doc),
    });

    // Final key-ordering/formatting normalization is delegated to
    // `canonical_json` from ucm_core::normalize.
    Ok(canonical_json(&canonical))
}
794
795pub fn canonical_fingerprint(doc: &Document) -> Result<String> {
796 let canonical = canonical_codegraph_json(doc)?;
797 let mut hasher = Sha256::new();
798 hasher.update(canonical.as_bytes());
799 let digest = hasher.finalize();
800 Ok(hex::encode(digest))
801}
802
803pub fn codegraph_prompt_projection(doc: &Document) -> String {
804 let mapper = IdMapper::from_document(doc);
805 mapper.document_to_prompt(doc)
806}
807
/// Languages the extractor can parse; each maps to a tree-sitter grammar in
/// `language_for` and to a stable tag via `as_str`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum CodeLanguage {
    Rust,
    Python,
    TypeScript,
    JavaScript,
}
815
impl CodeLanguage {
    /// Stable lowercase language tag stored in block metadata
    /// (`META_LANGUAGE`) and passed to import resolution.
    fn as_str(self) -> &'static str {
        match self {
            Self::Rust => "rust",
            Self::Python => "python",
            Self::TypeScript => "typescript",
            Self::JavaScript => "javascript",
        }
    }
}
826
/// A source file discovered during repository traversal.
#[derive(Debug, Clone)]
struct RepoFile {
    absolute_path: PathBuf,
    // Repo-relative path; used as the key in `file_ids` and in logical keys.
    relative_path: String,
    language: CodeLanguage,
}
833
/// A top-level symbol extracted from a parsed source file.
#[derive(Debug, Clone)]
struct ExtractedSymbol {
    name: String,
    // e.g. "function", "struct", "class" — language-specific.
    kind: String,
    // Language-specific visibility heuristic (Rust: `pub` prefix; Python: no
    // leading underscore).
    exported: bool,
    // Span coordinates come from `node_span` (defined elsewhere); tree-sitter
    // reports 0-based positions — confirm in `node_span`.
    start_line: usize,
    start_col: usize,
    end_line: usize,
    end_col: usize,
}
844
/// A raw import/module reference extracted from a source file, later resolved
/// to a known file by `resolve_import`.
#[derive(Debug, Clone)]
struct ExtractedImport {
    module: String,
}
849
/// Everything `analyze_file` produces for one file: symbols, imports, and any
/// parse/extraction diagnostics.
#[derive(Debug, Clone, Default)]
struct FileAnalysis {
    symbols: Vec<ExtractedSymbol>,
    imports: Vec<ExtractedImport>,
    diagnostics: Vec<CodeGraphDiagnostic>,
}
856
/// Per-file record retained after block creation, used in the second pass
/// that resolves imports into reference edges.
#[derive(Debug, Clone)]
struct FileAnalysisRecord {
    file: String,
    language: CodeLanguage,
    imports: Vec<ExtractedImport>,
}
863
864fn initialize_document_metadata(
865 doc: &mut Document,
866 repo_root: &Path,
867 repo_name: &str,
868 commit: &str,
869) {
870 doc.metadata.title = Some(format!("CodeGraph: {}", repo_name));
871 doc.metadata.description = Some("CodeGraphProfile v1 document".to_string());
872 doc.metadata.language = Some("multi".to_string());
873 doc.metadata
874 .custom
875 .insert("profile".to_string(), json!(CODEGRAPH_PROFILE));
876 doc.metadata.custom.insert(
877 "profile_version".to_string(),
878 json!(CODEGRAPH_PROFILE_VERSION),
879 );
880 doc.metadata.custom.insert(
881 "profile_marker".to_string(),
882 json!(CODEGRAPH_PROFILE_MARKER),
883 );
884 doc.metadata.custom.insert(
885 "extractor_version".to_string(),
886 json!(CODEGRAPH_EXTRACTOR_VERSION),
887 );
888 doc.metadata
889 .custom
890 .insert("commit_hash".to_string(), json!(commit));
891 doc.metadata.custom.insert(
892 "repository_path".to_string(),
893 json!(normalize_path(repo_root)),
894 );
895}
896
897fn make_repository_block(repo_name: &str, commit_hash: &str) -> Block {
898 let mut block = Block::new(
899 Content::json(json!({
900 "name": repo_name,
901 "commit": commit_hash,
902 })),
903 Some("custom.repository"),
904 );
905 block.metadata.label = Some(repo_name.to_string());
906 block
907 .metadata
908 .custom
909 .insert(META_NODE_CLASS.to_string(), json!("repository"));
910 block.metadata.custom.insert(
911 META_LOGICAL_KEY.to_string(),
912 json!(format!("repository:{}", repo_name)),
913 );
914 block
915}
916
917fn make_directory_block(path: &str) -> Block {
918 let mut block = Block::new(
919 Content::json(json!({
920 "path": path,
921 })),
922 Some("custom.directory"),
923 );
924 block.metadata.label = Some(path.to_string());
925 block
926 .metadata
927 .custom
928 .insert(META_NODE_CLASS.to_string(), json!("directory"));
929 block
930 .metadata
931 .custom
932 .insert(META_PATH.to_string(), json!(path));
933 block.metadata.custom.insert(
934 META_LOGICAL_KEY.to_string(),
935 json!(format!("directory:{}", path)),
936 );
937 block
938}
939
940fn make_file_block(path: &str, language: &str) -> Block {
941 let mut block = Block::new(
942 Content::json(json!({
943 "path": path,
944 "language": language,
945 })),
946 Some("custom.file"),
947 );
948 block.metadata.label = Some(path.to_string());
949 block
950 .metadata
951 .custom
952 .insert(META_NODE_CLASS.to_string(), json!("file"));
953 block
954 .metadata
955 .custom
956 .insert(META_PATH.to_string(), json!(path));
957 block
958 .metadata
959 .custom
960 .insert(META_LANGUAGE.to_string(), json!(language));
961 block.metadata.custom.insert(
962 META_LOGICAL_KEY.to_string(),
963 json!(format!("file:{}", path)),
964 );
965 block
966}
967
968fn make_symbol_block(
969 logical_key: &str,
970 path: &str,
971 language: &str,
972 symbol: &ExtractedSymbol,
973) -> Block {
974 let span = json!({
975 "start_line": symbol.start_line,
976 "start_col": symbol.start_col,
977 "end_line": symbol.end_line,
978 "end_col": symbol.end_col,
979 });
980
981 let mut block = Block::new(
982 Content::json(json!({
983 "name": symbol.name,
984 "kind": symbol.kind,
985 "path": path,
986 "span": span,
987 "exported": symbol.exported,
988 })),
989 Some("custom.symbol"),
990 );
991
992 block.metadata.label = Some(symbol.name.clone());
993 block
994 .metadata
995 .custom
996 .insert(META_NODE_CLASS.to_string(), json!("symbol"));
997 block
998 .metadata
999 .custom
1000 .insert(META_LOGICAL_KEY.to_string(), json!(logical_key));
1001 block
1002 .metadata
1003 .custom
1004 .insert(META_PATH.to_string(), json!(path));
1005 block
1006 .metadata
1007 .custom
1008 .insert(META_LANGUAGE.to_string(), json!(language));
1009 block
1010 .metadata
1011 .custom
1012 .insert(META_SYMBOL_KIND.to_string(), json!(symbol.kind));
1013 block
1014 .metadata
1015 .custom
1016 .insert(META_SYMBOL_NAME.to_string(), json!(symbol.name));
1017 block.metadata.custom.insert(META_SPAN.to_string(), span);
1018 block
1019 .metadata
1020 .custom
1021 .insert(META_EXPORTED.to_string(), json!(symbol.exported));
1022 block
1023}
1024
/// Parses `source` with the tree-sitter grammar for `language` and extracts
/// top-level symbols and imports.
///
/// Never fails: parser-initialization problems (CG2010), a missing parse tree
/// (CG2011), reported syntax errors (CG2002 — extraction still proceeds), and
/// an empty symbol list (CG2001, info) are all surfaced through
/// `FileAnalysis::diagnostics`, each tagged with `path`.
fn analyze_file(path: &str, source: &str, language: CodeLanguage) -> FileAnalysis {
    let mut analysis = FileAnalysis::default();
    let mut parser = Parser::new();
    // NOTE(review): `set_language` taking `Language` by value matches the
    // pre-0.21 tree-sitter API — confirm against the pinned crate version.
    if parser.set_language(language_for(language)).is_err() {
        analysis.diagnostics.push(
            CodeGraphDiagnostic::error(
                "CG2010",
                format!(
                    "failed to initialize tree-sitter parser for {}",
                    language.as_str()
                ),
            )
            .with_path(path.to_string()),
        );
        return analysis;
    }

    let Some(tree) = parser.parse(source, None) else {
        analysis.diagnostics.push(
            CodeGraphDiagnostic::error("CG2011", "tree-sitter returned no parse tree")
                .with_path(path.to_string()),
        );
        return analysis;
    };

    let root = tree.root_node();
    // Syntax errors are only a warning: tree-sitter still produces a usable
    // (partial) tree.
    if root.has_error() {
        analysis.diagnostics.push(
            CodeGraphDiagnostic::warning(
                "CG2002",
                "tree-sitter parser reported syntax errors; extraction continues",
            )
            .with_path(path.to_string()),
        );
    }

    match language {
        CodeLanguage::Rust => analyze_rust_tree(source, root, &mut analysis),
        CodeLanguage::Python => analyze_python_tree(source, root, &mut analysis),
        CodeLanguage::TypeScript | CodeLanguage::JavaScript => {
            analyze_ts_tree(source, root, &mut analysis)
        }
    }

    if analysis.symbols.is_empty() {
        analysis.diagnostics.push(
            CodeGraphDiagnostic::info(
                "CG2001",
                format!("no top-level symbols extracted for {}", path),
            )
            .with_path(path.to_string()),
        );
    }

    analysis
}
1081
/// Maps a [`CodeLanguage`] to its tree-sitter grammar. TypeScript uses the
/// TS-specific grammar (not TSX).
fn language_for(language: CodeLanguage) -> Language {
    match language {
        CodeLanguage::Rust => tree_sitter_rust::language(),
        CodeLanguage::Python => tree_sitter_python::language(),
        CodeLanguage::TypeScript => tree_sitter_typescript::language_typescript(),
        CodeLanguage::JavaScript => tree_sitter_javascript::language(),
    }
}
1090
1091fn analyze_rust_tree(source: &str, root: Node<'_>, analysis: &mut FileAnalysis) {
1092 let mut cursor = root.walk();
1093 for node in root.named_children(&mut cursor) {
1094 match node.kind() {
1095 "use_declaration" => {
1096 let import_text = node_text(source, node)
1097 .trim()
1098 .trim_start_matches("pub ")
1099 .trim_start_matches("use ")
1100 .trim_end_matches(';')
1101 .trim()
1102 .to_string();
1103 if !import_text.is_empty() {
1104 analysis.imports.push(ExtractedImport {
1105 module: import_text,
1106 });
1107 }
1108 }
1109 "mod_item" => {
1110 let text = node_text(source, node);
1111 if text.trim().ends_with(';') {
1112 if let Some(name) = rust_symbol_name(node, source) {
1113 analysis.imports.push(ExtractedImport {
1114 module: format!("mod:{}", name),
1115 });
1116 }
1117 }
1118 if let Some(symbol) = rust_symbol_from_node(node, source) {
1119 analysis.symbols.push(symbol);
1120 }
1121 }
1122 "function_item" | "struct_item" | "enum_item" | "trait_item" | "impl_item"
1123 | "type_item" | "const_item" => {
1124 if let Some(symbol) = rust_symbol_from_node(node, source) {
1125 analysis.symbols.push(symbol);
1126 }
1127 }
1128 _ => {}
1129 }
1130 }
1131}
1132
1133fn rust_symbol_from_node(node: Node<'_>, source: &str) -> Option<ExtractedSymbol> {
1134 let kind = match node.kind() {
1135 "function_item" => "function",
1136 "struct_item" => "struct",
1137 "enum_item" => "enum",
1138 "trait_item" => "trait",
1139 "impl_item" => "impl",
1140 "type_item" => "type",
1141 "const_item" => "const",
1142 "mod_item" => "module",
1143 _ => return None,
1144 };
1145
1146 let name = rust_symbol_name(node, source)?;
1147 let exported = node_text(source, node).trim_start().starts_with("pub");
1148 let (start_line, start_col, end_line, end_col) = node_span(node);
1149
1150 Some(ExtractedSymbol {
1151 name,
1152 kind: kind.to_string(),
1153 exported,
1154 start_line,
1155 start_col,
1156 end_line,
1157 end_col,
1158 })
1159}
1160
1161fn rust_symbol_name(node: Node<'_>, source: &str) -> Option<String> {
1162 if let Some(name_node) = node.child_by_field_name("name") {
1163 let name = node_text(source, name_node).trim().to_string();
1164 if !name.is_empty() {
1165 return Some(name);
1166 }
1167 }
1168
1169 if node.kind() == "impl_item" {
1170 if let Some(type_node) = node.child_by_field_name("type") {
1171 let name = node_text(source, type_node).trim().to_string();
1172 if !name.is_empty() {
1173 return Some(name);
1174 }
1175 }
1176 }
1177
1178 first_named_identifier(node, source)
1179}
1180
1181fn analyze_python_tree(source: &str, root: Node<'_>, analysis: &mut FileAnalysis) {
1182 let mut cursor = root.walk();
1183 for node in root.named_children(&mut cursor) {
1184 match node.kind() {
1185 "import_statement" => {
1186 let text = node_text(source, node).trim().to_string();
1187 if let Some(list) = text.strip_prefix("import ") {
1188 for item in list.split(',') {
1189 let name = item.split_whitespace().next().unwrap_or("").trim();
1190 if !name.is_empty() {
1191 analysis.imports.push(ExtractedImport {
1192 module: name.to_string(),
1193 });
1194 }
1195 }
1196 }
1197 }
1198 "import_from_statement" => {
1199 let text = node_text(source, node).trim().to_string();
1200 if let Some(rest) = text.strip_prefix("from ") {
1201 if let Some((module, _)) = rest.split_once(" import ") {
1202 let module_name = module.trim();
1203 if !module_name.is_empty() {
1204 analysis.imports.push(ExtractedImport {
1205 module: module_name.to_string(),
1206 });
1207 }
1208 }
1209 }
1210 }
1211 "function_definition" | "class_definition" => {
1212 let Some(name_node) = node.child_by_field_name("name") else {
1213 continue;
1214 };
1215 let name = node_text(source, name_node).trim().to_string();
1216 if name.is_empty() {
1217 continue;
1218 }
1219 let (start_line, start_col, end_line, end_col) = node_span(node);
1220 analysis.symbols.push(ExtractedSymbol {
1221 name: name.clone(),
1222 kind: if node.kind() == "class_definition" {
1223 "class".to_string()
1224 } else {
1225 "function".to_string()
1226 },
1227 exported: !name.starts_with('_'),
1228 start_line,
1229 start_col,
1230 end_line,
1231 end_col,
1232 });
1233 }
1234 _ => {}
1235 }
1236 }
1237}
1238
1239fn analyze_ts_tree(source: &str, root: Node<'_>, analysis: &mut FileAnalysis) {
1240 let mut cursor = root.walk();
1241 for node in root.named_children(&mut cursor) {
1242 match node.kind() {
1243 "import_statement" => {
1244 if let Some(module) = extract_ts_module_from_text(node_text(source, node)) {
1245 analysis.imports.push(ExtractedImport { module });
1246 }
1247 }
1248 "export_statement" => {
1249 if let Some(module) = extract_ts_module_from_text(node_text(source, node)) {
1250 analysis.imports.push(ExtractedImport { module });
1251 }
1252 analysis
1253 .symbols
1254 .extend(ts_symbols_from_export_statement(node, source));
1255 }
1256 "function_declaration"
1257 | "class_declaration"
1258 | "interface_declaration"
1259 | "type_alias_declaration"
1260 | "enum_declaration"
1261 | "module" => {
1262 if let Some(symbol) = ts_symbol_from_declaration(node, source, false) {
1263 analysis.symbols.push(symbol);
1264 }
1265 }
1266 "lexical_declaration" | "variable_statement" => {
1267 analysis
1268 .symbols
1269 .extend(ts_variable_symbols(node, source, false));
1270 }
1271 _ => {}
1272 }
1273 }
1274}
1275
1276fn ts_symbols_from_export_statement(node: Node<'_>, source: &str) -> Vec<ExtractedSymbol> {
1277 let mut out = Vec::new();
1278 let mut cursor = node.walk();
1279 for child in node.named_children(&mut cursor) {
1280 match child.kind() {
1281 "function_declaration"
1282 | "class_declaration"
1283 | "interface_declaration"
1284 | "type_alias_declaration"
1285 | "enum_declaration"
1286 | "module" => {
1287 if let Some(symbol) = ts_symbol_from_declaration(child, source, true) {
1288 out.push(symbol);
1289 }
1290 }
1291 "lexical_declaration" | "variable_statement" => {
1292 out.extend(ts_variable_symbols(child, source, true));
1293 }
1294 _ => {}
1295 }
1296 }
1297 out
1298}
1299
1300fn ts_symbol_from_declaration(
1301 node: Node<'_>,
1302 source: &str,
1303 exported_hint: bool,
1304) -> Option<ExtractedSymbol> {
1305 let kind = match node.kind() {
1306 "function_declaration" => "function",
1307 "class_declaration" => "class",
1308 "interface_declaration" => "interface",
1309 "type_alias_declaration" => "type",
1310 "enum_declaration" => "enum",
1311 "module" => "namespace",
1312 _ => return None,
1313 };
1314
1315 let name = node
1316 .child_by_field_name("name")
1317 .map(|n| node_text(source, n).trim().to_string())
1318 .or_else(|| first_named_identifier(node, source))?;
1319 if name.is_empty() {
1320 return None;
1321 }
1322 let exported = exported_hint || node_text(source, node).trim_start().starts_with("export ");
1323 let (start_line, start_col, end_line, end_col) = node_span(node);
1324
1325 Some(ExtractedSymbol {
1326 name,
1327 kind: kind.to_string(),
1328 exported,
1329 start_line,
1330 start_col,
1331 end_line,
1332 end_col,
1333 })
1334}
1335
1336fn ts_variable_symbols(node: Node<'_>, source: &str, exported_hint: bool) -> Vec<ExtractedSymbol> {
1337 let mut out = Vec::new();
1338 let exported = exported_hint || node_text(source, node).trim_start().starts_with("export ");
1339
1340 let mut stack = vec![node];
1341 while let Some(current) = stack.pop() {
1342 if current.kind() == "variable_declarator" {
1343 if let Some(name_node) = current.child_by_field_name("name") {
1344 let name = node_text(source, name_node).trim().to_string();
1345 if !name.is_empty() {
1346 let (start_line, start_col, end_line, end_col) = node_span(current);
1347 out.push(ExtractedSymbol {
1348 name,
1349 kind: "variable".to_string(),
1350 exported,
1351 start_line,
1352 start_col,
1353 end_line,
1354 end_col,
1355 });
1356 }
1357 }
1358 continue;
1359 }
1360
1361 let mut cursor = current.walk();
1362 for child in current.named_children(&mut cursor) {
1363 stack.push(child);
1364 }
1365 }
1366
1367 out
1368}
1369
1370fn extract_ts_module_from_text(text: &str) -> Option<String> {
1371 let patterns = [
1372 Regex::new(r#"(?i)\bfrom\s+['"]([^'"]+)['"]"#).ok()?,
1373 Regex::new(r#"(?i)\bimport\s+['"]([^'"]+)['"]"#).ok()?,
1374 Regex::new(r#"(?i)require\(\s*['"]([^'"]+)['"]\s*\)"#).ok()?,
1375 ];
1376 for pattern in patterns {
1377 if let Some(caps) = pattern.captures(text) {
1378 if let Some(module) = caps.get(1).map(|m| m.as_str().trim()) {
1379 if !module.is_empty() {
1380 return Some(module.to_string());
1381 }
1382 }
1383 }
1384 }
1385 None
1386}
1387
1388fn node_text<'a>(source: &'a str, node: Node<'_>) -> &'a str {
1389 let start = node.start_byte().min(source.len());
1390 let end = node.end_byte().min(source.len());
1391 &source[start..end]
1392}
1393
1394fn node_span(node: Node<'_>) -> (usize, usize, usize, usize) {
1395 let start = node.start_position();
1396 let end = node.end_position();
1397 (start.row + 1, start.column + 1, end.row + 1, end.column + 1)
1398}
1399
1400fn first_named_identifier(node: Node<'_>, source: &str) -> Option<String> {
1401 let mut stack = vec![node];
1402 while let Some(current) = stack.pop() {
1403 if matches!(current.kind(), "identifier" | "type_identifier") {
1404 let text = node_text(source, current).trim().to_string();
1405 if !text.is_empty() {
1406 return Some(text);
1407 }
1408 }
1409
1410 let mut cursor = current.walk();
1411 for child in current.named_children(&mut cursor) {
1412 stack.push(child);
1413 }
1414 }
1415 None
1416}
1417
1418fn resolve_import(
1419 source_file: &str,
1420 language: &CodeLanguage,
1421 module: &str,
1422 known_files: &BTreeSet<String>,
1423) -> Option<String> {
1424 match language {
1425 CodeLanguage::Rust => resolve_rust_import(source_file, module, known_files),
1426 CodeLanguage::Python => resolve_python_import(source_file, module, known_files),
1427 CodeLanguage::TypeScript | CodeLanguage::JavaScript => {
1428 resolve_ts_import(source_file, module, known_files)
1429 }
1430 }
1431}
1432
1433fn resolve_ts_import(
1434 source_file: &str,
1435 module: &str,
1436 known_files: &BTreeSet<String>,
1437) -> Option<String> {
1438 if !module.starts_with('.') {
1439 return None;
1440 }
1441
1442 let source_dir = parent_directory(source_file);
1443 let joined = normalize_relative_join(&source_dir, module);
1444
1445 ts_candidates(&joined)
1446 .into_iter()
1447 .find(|candidate| known_files.contains(candidate))
1448}
1449
1450fn ts_candidates(base: &str) -> Vec<String> {
1451 let exts = ["ts", "tsx", "js", "jsx"];
1452 let mut out = Vec::new();
1453
1454 if has_known_extension(base, &exts) {
1455 out.push(base.to_string());
1456 } else {
1457 for ext in exts {
1458 out.push(format!("{}.{}", base, ext));
1459 }
1460 for ext in exts {
1461 out.push(format!("{}/index.{}", base, ext));
1462 }
1463 }
1464
1465 out
1466}
1467
1468fn resolve_python_import(
1469 source_file: &str,
1470 module: &str,
1471 known_files: &BTreeSet<String>,
1472) -> Option<String> {
1473 let source_dir = parent_directory(source_file);
1474 let mut dots = 0usize;
1475 for ch in module.chars() {
1476 if ch == '.' {
1477 dots += 1;
1478 } else {
1479 break;
1480 }
1481 }
1482
1483 let module_tail = module.trim_start_matches('.');
1484
1485 let base_dir = if dots > 0 {
1486 ascend_directory(&source_dir, dots.saturating_sub(1))
1487 } else {
1488 String::new()
1489 };
1490
1491 let module_path = module_tail.replace('.', "/");
1492
1493 let joined = if base_dir.is_empty() {
1494 module_path
1495 } else if module_path.is_empty() {
1496 base_dir
1497 } else {
1498 format!("{}/{}", base_dir, module_path)
1499 };
1500
1501 py_candidates(&joined)
1502 .into_iter()
1503 .find(|candidate| known_files.contains(candidate))
1504}
1505
/// Candidate file paths for a resolved Python module base: the path itself
/// when it is already a `.py` file, otherwise `base.py` and the package
/// form `base/__init__.py`. An empty base yields no candidates.
fn py_candidates(base: &str) -> Vec<String> {
    match base {
        "" => Vec::new(),
        path if path.ends_with(".py") => vec![path.to_string()],
        path => vec![format!("{}.py", path), format!("{}/__init__.py", path)],
    }
}
1517
/// Resolves a Rust import specifier from `source_file` to a repository file.
///
/// `module` is either a `use`-path (possibly prefixed with `crate::`,
/// `self::`, or `super::`) or a `mod:<name>` entry emitted for `mod name;`
/// declarations. Standard-library paths and anything not found in
/// `known_files` resolve to `None`.
fn resolve_rust_import(
    source_file: &str,
    module: &str,
    known_files: &BTreeSet<String>,
) -> Option<String> {
    // Standard-library crates can never map to repository files.
    if module.starts_with("std::") || module.starts_with("core::") || module.starts_with("alloc::")
    {
        return None;
    }

    // `mod name;` declarations resolve to a sibling `name.rs` or `name/mod.rs`.
    if let Some(name) = module.strip_prefix("mod:") {
        let source_dir = parent_directory(source_file);
        let local = normalize_relative_join(&source_dir, name);
        for candidate in [format!("{}.rs", local), format!("{}/mod.rs", local)] {
            if known_files.contains(&candidate) {
                return Some(candidate);
            }
        }
        return None;
    }

    let source_dir = parent_directory(source_file);

    // Map the path prefix to a base directory plus remaining segments:
    // `crate::` starts from `src`, `self::` from the current directory,
    // `super::` ascends one directory per repetition, and a bare path is
    // treated like a crate-rooted one (assumes the conventional `src/`
    // layout — external crate paths simply fail to resolve below).
    let (base_dir, path_segments) = if let Some(rest) = module.strip_prefix("crate::") {
        (
            "src".to_string(),
            rest.split("::").map(|s| s.to_string()).collect::<Vec<_>>(),
        )
    } else if let Some(rest) = module.strip_prefix("self::") {
        (
            source_dir.clone(),
            rest.split("::").map(|s| s.to_string()).collect::<Vec<_>>(),
        )
    } else if module.starts_with("super::") {
        // Count consecutive `super::` prefixes and strip them all.
        let mut rest = module;
        let mut super_count = 0usize;
        while let Some(next) = rest.strip_prefix("super::") {
            super_count += 1;
            rest = next;
        }
        (
            ascend_directory(&source_dir, super_count),
            rest.split("::").map(|s| s.to_string()).collect::<Vec<_>>(),
        )
    } else {
        (
            "src".to_string(),
            module
                .split("::")
                .map(|s| s.to_string())
                .collect::<Vec<_>>(),
        )
    };

    // Trailing segments of a `use` path may name items rather than modules,
    // so probe progressively shorter prefixes (longest first), each as
    // `<path>.rs` or `<path>/mod.rs`.
    for trimmed in (1..=path_segments.len()).rev() {
        let joined = path_segments[..trimmed].join("/");
        if joined.is_empty() {
            continue;
        }
        let candidate_base = if base_dir.is_empty() {
            joined
        } else {
            format!("{}/{}", base_dir, joined)
        };

        for candidate in [
            format!("{}.rs", candidate_base),
            format!("{}/mod.rs", candidate_base),
        ] {
            if known_files.contains(&candidate) {
                return Some(candidate);
            }
        }
    }

    None
}
1595
/// Returns true when `path` ends with `.<ext>` for any extension in `exts`.
///
/// Equivalent to probing `path.ends_with(&format!(".{ext}"))` per extension,
/// but without allocating a string for every probe.
fn has_known_extension(path: &str, exts: &[&str]) -> bool {
    exts.iter().any(|ext| {
        path.strip_suffix(ext)
            .and_then(|rest| rest.strip_suffix('.'))
            .is_some()
    })
}
1599
1600fn normalize_temporal_fields(doc: &mut Document) {
1601 let ts = deterministic_timestamp();
1602 doc.metadata.created_at = ts;
1603 doc.metadata.modified_at = ts;
1604 doc.version.timestamp = ts;
1605
1606 for block in doc.blocks.values_mut() {
1607 block.metadata.created_at = ts;
1608 block.metadata.modified_at = ts;
1609 block.version.timestamp = ts;
1610
1611 for edge in &mut block.edges {
1612 edge.created_at = ts;
1613 }
1614 }
1615}
1616
1617fn deterministic_timestamp() -> chrono::DateTime<chrono::Utc> {
1618 chrono::DateTime::parse_from_rfc3339("1970-01-01T00:00:00Z")
1619 .unwrap()
1620 .with_timezone(&chrono::Utc)
1621}
1622
1623fn sort_structure_children_by_logical_key(doc: &mut Document) {
1624 let key_index = logical_key_index(doc);
1625
1626 for children in doc.structure.values_mut() {
1627 children.sort_by(|a, b| {
1628 let ka = key_index.get(a).cloned().unwrap_or_else(|| a.to_string());
1629 let kb = key_index.get(b).cloned().unwrap_or_else(|| b.to_string());
1630 ka.cmp(&kb)
1631 });
1632 }
1633}
1634
1635fn sort_edges(doc: &mut Document) {
1636 let key_index = logical_key_index(doc);
1637
1638 for block in doc.blocks.values_mut() {
1639 block.edges.sort_by(|a, b| {
1640 let at = key_index
1641 .get(&a.target)
1642 .cloned()
1643 .unwrap_or_else(|| a.target.to_string());
1644 let bt = key_index
1645 .get(&b.target)
1646 .cloned()
1647 .unwrap_or_else(|| b.target.to_string());
1648
1649 a.edge_type
1650 .as_str()
1651 .cmp(&b.edge_type.as_str())
1652 .then_with(|| at.cmp(&bt))
1653 });
1654 }
1655}
1656
1657fn compute_stats(doc: &Document) -> CodeGraphStats {
1658 let mut stats = CodeGraphStats::default();
1659
1660 for (id, block) in &doc.blocks {
1661 if *id == doc.root {
1662 continue;
1663 }
1664
1665 stats.total_nodes += 1;
1666
1667 match node_class(block).as_deref() {
1668 Some("repository") => stats.repository_nodes += 1,
1669 Some("directory") => stats.directory_nodes += 1,
1670 Some("file") => {
1671 stats.file_nodes += 1;
1672 if let Some(lang) = block
1673 .metadata
1674 .custom
1675 .get(META_LANGUAGE)
1676 .and_then(|v| v.as_str())
1677 {
1678 *stats.languages.entry(lang.to_string()).or_default() += 1;
1679 }
1680 }
1681 Some("symbol") => stats.symbol_nodes += 1,
1682 _ => {}
1683 }
1684
1685 for edge in &block.edges {
1686 stats.total_edges += 1;
1687 match &edge.edge_type {
1688 EdgeType::References => stats.reference_edges += 1,
1689 EdgeType::Custom(name) if name == "exports" => stats.export_edges += 1,
1690 _ => {}
1691 }
1692 }
1693 }
1694
1695 stats
1696}
1697
1698fn block_logical_key(block: &Block) -> Option<String> {
1699 block
1700 .metadata
1701 .custom
1702 .get(META_LOGICAL_KEY)
1703 .and_then(|v| v.as_str())
1704 .map(|s| s.to_string())
1705}
1706
1707fn block_path(block: &Block) -> Option<String> {
1708 block
1709 .metadata
1710 .custom
1711 .get(META_PATH)
1712 .and_then(|v| v.as_str())
1713 .map(|s| s.to_string())
1714}
1715
1716fn node_class(block: &Block) -> Option<String> {
1717 if let Some(class) = block
1718 .metadata
1719 .custom
1720 .get(META_NODE_CLASS)
1721 .and_then(|v| v.as_str())
1722 {
1723 return Some(class.to_string());
1724 }
1725
1726 if let Some(role) = &block.metadata.semantic_role {
1727 if role.category == ucm_core::RoleCategory::Custom {
1728 if let Some(sub) = &role.subcategory {
1729 return Some(sub.to_string());
1730 }
1731 }
1732 }
1733
1734 None
1735}
1736
1737fn validate_required_metadata(
1738 class_name: &str,
1739 block: &Block,
1740 diagnostics: &mut Vec<CodeGraphDiagnostic>,
1741) {
1742 let required = match class_name {
1743 "repository" => vec![META_LOGICAL_KEY],
1744 "directory" => vec![META_LOGICAL_KEY, META_PATH],
1745 "file" => vec![META_LOGICAL_KEY, META_PATH, META_LANGUAGE],
1746 "symbol" => vec![
1747 META_LOGICAL_KEY,
1748 META_PATH,
1749 META_LANGUAGE,
1750 META_SYMBOL_KIND,
1751 META_SYMBOL_NAME,
1752 META_SPAN,
1753 META_EXPORTED,
1754 ],
1755 _ => {
1756 diagnostics.push(CodeGraphDiagnostic::error(
1757 "CG1017",
1758 format!("invalid node_class '{}'", class_name),
1759 ));
1760 return;
1761 }
1762 };
1763
1764 for key in required {
1765 if !block.metadata.custom.contains_key(key) {
1766 diagnostics.push(
1767 CodeGraphDiagnostic::error(
1768 "CG1018",
1769 format!(
1770 "node class '{}' missing required metadata key '{}'",
1771 class_name, key
1772 ),
1773 )
1774 .with_logical_key(block_logical_key(block).unwrap_or_else(|| block.id.to_string())),
1775 );
1776 }
1777 }
1778
1779 if let Some(logical_key) = block_logical_key(block) {
1780 let expected_prefix = match class_name {
1781 "repository" => "repository:",
1782 "directory" => "directory:",
1783 "file" => "file:",
1784 "symbol" => "symbol:",
1785 _ => "",
1786 };
1787
1788 if !expected_prefix.is_empty() && !logical_key.starts_with(expected_prefix) {
1789 diagnostics.push(
1790 CodeGraphDiagnostic::error(
1791 "CG1019",
1792 format!(
1793 "logical_key '{}' must start with '{}'",
1794 logical_key, expected_prefix
1795 ),
1796 )
1797 .with_logical_key(logical_key),
1798 );
1799 }
1800 }
1801}
1802
1803fn logical_key_index(doc: &Document) -> HashMap<BlockId, String> {
1804 doc.blocks
1805 .iter()
1806 .map(|(id, block)| {
1807 (
1808 *id,
1809 block_logical_key(block).unwrap_or_else(|| id.to_string()),
1810 )
1811 })
1812 .collect()
1813}
1814
1815fn normalized_document_metadata(doc: &Document) -> serde_json::Value {
1816 let mut custom = serde_json::Map::new();
1817 let mut custom_entries: Vec<_> = doc.metadata.custom.iter().collect();
1818 custom_entries.sort_by(|a, b| a.0.cmp(b.0));
1819 for (k, v) in custom_entries {
1820 if is_volatile_metadata_key(k) {
1821 continue;
1822 }
1823 custom.insert(k.clone(), v.clone());
1824 }
1825
1826 json!({
1827 "title": doc.metadata.title,
1828 "description": doc.metadata.description,
1829 "authors": doc.metadata.authors,
1830 "language": doc.metadata.language,
1831 "custom": custom,
1832 })
1833}
1834
1835fn normalized_block_metadata(block: &Block) -> serde_json::Value {
1836 let mut custom = serde_json::Map::new();
1837 let mut entries: Vec<_> = block.metadata.custom.iter().collect();
1838 entries.sort_by(|a, b| a.0.cmp(b.0));
1839 for (k, v) in entries {
1840 if is_volatile_metadata_key(k) {
1841 continue;
1842 }
1843 custom.insert(k.clone(), v.clone());
1844 }
1845
1846 json!({
1847 "label": block.metadata.label,
1848 "semantic_role": block.metadata.semantic_role.as_ref().map(|r| r.to_string()),
1849 "tags": block.metadata.tags,
1850 "summary": block.metadata.summary,
1851 "custom": custom,
1852 })
1853}
1854
1855fn normalized_edge_metadata(edge: &Edge) -> serde_json::Value {
1856 let mut custom = serde_json::Map::new();
1857 let mut entries: Vec<_> = edge.metadata.custom.iter().collect();
1858 entries.sort_by(|a, b| a.0.cmp(b.0));
1859 for (k, v) in entries {
1860 if is_volatile_metadata_key(k) {
1861 continue;
1862 }
1863 custom.insert(k.clone(), v.clone());
1864 }
1865
1866 json!({
1867 "confidence": edge.metadata.confidence,
1868 "description": edge.metadata.description,
1869 "custom": custom,
1870 })
1871}
1872
/// Metadata keys that vary between runs and are therefore excluded from
/// canonical (fingerprinted) JSON.
fn is_volatile_metadata_key(key: &str) -> bool {
    const VOLATILE: [&str; 4] = ["generated_at", "runtime", "session", "timestamp"];
    VOLATILE.contains(&key)
}
1876
1877fn collect_repository_files(
1878 root: &Path,
1879 config: &CodeGraphExtractorConfig,
1880 matcher: &GitignoreMatcher,
1881 diagnostics: &mut Vec<CodeGraphDiagnostic>,
1882) -> Result<Vec<RepoFile>> {
1883 let include_exts: HashSet<String> = config
1884 .include_extensions
1885 .iter()
1886 .map(|ext| ext.trim_start_matches('.').to_ascii_lowercase())
1887 .collect();
1888
1889 let exclude_dirs: HashSet<String> = config.exclude_dirs.iter().cloned().collect();
1890
1891 let mut out = Vec::new();
1892 collect_repository_files_recursive(
1893 root,
1894 root,
1895 &include_exts,
1896 &exclude_dirs,
1897 config,
1898 matcher,
1899 diagnostics,
1900 &mut out,
1901 )?;
1902
1903 out.sort_by(|a, b| a.relative_path.cmp(&b.relative_path));
1904 Ok(out)
1905}
1906
/// Recursive worker for [`collect_repository_files`]: walks `current`,
/// appending matching files to `out`. Unreadable directories or entries are
/// recorded as warnings instead of aborting the whole walk.
#[allow(clippy::too_many_arguments)]
fn collect_repository_files_recursive(
    root: &Path,
    current: &Path,
    include_exts: &HashSet<String>,
    exclude_dirs: &HashSet<String>,
    config: &CodeGraphExtractorConfig,
    matcher: &GitignoreMatcher,
    diagnostics: &mut Vec<CodeGraphDiagnostic>,
    out: &mut Vec<RepoFile>,
) -> Result<()> {
    // An unreadable directory becomes a warning, not an error, so one bad
    // directory does not sink the collection.
    let read_dir = match fs::read_dir(current) {
        Ok(rd) => rd,
        Err(err) => {
            diagnostics.push(CodeGraphDiagnostic::warning(
                "CG2004",
                format!("failed to read directory {}: {}", current.display(), err),
            ));
            return Ok(());
        }
    };

    let mut entries = Vec::new();
    for entry in read_dir {
        match entry {
            Ok(e) => entries.push(e),
            Err(err) => diagnostics.push(CodeGraphDiagnostic::warning(
                "CG2005",
                format!("failed to access directory entry: {}", err),
            )),
        }
    }

    // Sort by file name so traversal (and thus output/diagnostic order) is
    // deterministic across platforms.
    entries.sort_by_key(|entry| entry.file_name());

    for entry in entries {
        let path = entry.path();
        // Repository-relative, slash-separated path used by all the
        // matchers below.
        let rel = normalize_path(
            path.strip_prefix(root)
                .with_context(|| format!("failed to strip prefix {}", root.display()))?,
        );

        if rel.is_empty() {
            continue;
        }

        let file_type = match entry.file_type() {
            Ok(ft) => ft,
            Err(err) => {
                diagnostics.push(CodeGraphDiagnostic::warning(
                    "CG2005",
                    format!("failed to read file type for {}: {}", rel, err),
                ));
                continue;
            }
        };

        // Dotfiles/dot-directories are skipped unless explicitly included.
        if !config.include_hidden && is_hidden_path(&rel) {
            continue;
        }

        if file_type.is_dir() {
            let dir_name = path
                .file_name()
                .map(|n| n.to_string_lossy().to_string())
                .unwrap_or_default();

            // Prune configured directories (matched by bare name) and
            // gitignored directories before descending.
            if exclude_dirs.contains(&dir_name) || matcher.is_ignored(&rel, true) {
                continue;
            }

            collect_repository_files_recursive(
                root,
                &path,
                include_exts,
                exclude_dirs,
                config,
                matcher,
                diagnostics,
                out,
            )?;
            continue;
        }

        // Anything that is not a regular file (symlinks, sockets, ...) is
        // skipped silently.
        if !file_type.is_file() {
            continue;
        }

        if matcher.is_ignored(&rel, false) {
            continue;
        }

        // Extension matching is case-insensitive; files with no extension
        // get the empty string, which will not be in `include_exts`.
        let ext = path
            .extension()
            .and_then(|e| e.to_str())
            .map(|e| e.to_ascii_lowercase())
            .unwrap_or_default();

        if !include_exts.contains(&ext) {
            continue;
        }

        if let Some(language) = extension_language(&ext) {
            out.push(RepoFile {
                absolute_path: path,
                relative_path: rel,
                language,
            });
        } else {
            // The extension was configured for inclusion but no analyzer
            // exists for it — surface that mismatch as an info diagnostic.
            diagnostics.push(
                CodeGraphDiagnostic::info("CG2007", format!("unsupported extension '.{}'", ext))
                    .with_path(rel),
            );
        }
    }

    Ok(())
}
2025
2026fn extension_language(ext: &str) -> Option<CodeLanguage> {
2027 match ext {
2028 "rs" => Some(CodeLanguage::Rust),
2029 "py" => Some(CodeLanguage::Python),
2030 "ts" | "tsx" => Some(CodeLanguage::TypeScript),
2031 "js" | "jsx" => Some(CodeLanguage::JavaScript),
2032 _ => None,
2033 }
2034}
2035
/// Produces a unique logical key for a symbol, registering it in `used`.
///
/// Tries `symbol:<file>::<name>` first, then `...#<line>`, then
/// `...#<line>#<n>` for n = 2, 3, ... until an unused key is found.
fn unique_symbol_logical_key(
    file_path: &str,
    symbol_name: &str,
    line: usize,
    used: &mut HashSet<String>,
) -> String {
    let base = format!("symbol:{}::{}", file_path, symbol_name);
    let with_line = format!("{}#{}", base, line);

    let mut attempt = 0usize;
    loop {
        let candidate = match attempt {
            0 => base.clone(),
            1 => with_line.clone(),
            n => format!("{}#{}", with_line, n),
        };
        attempt += 1;
        if used.insert(candidate.clone()) {
            return candidate;
        }
    }
}
2061
/// Lists every ancestor directory of `path` from shallowest to deepest,
/// e.g. `"a/b/c.rs"` -> `["a", "a/b"]`. A bare filename has no ancestors.
fn ancestor_directories(path: &str) -> Vec<String> {
    let parts: Vec<&str> = path.split('/').collect();
    (1..parts.len())
        .map(|depth| parts[..depth].join("/"))
        .filter(|dir| !dir.is_empty())
        .collect()
}
2077
2078fn parent_directory_id(dir: &str, directory_ids: &BTreeMap<String, BlockId>) -> Option<BlockId> {
2079 let parent = parent_directory(dir);
2080 if parent.is_empty() {
2081 None
2082 } else {
2083 directory_ids.get(&parent).copied()
2084 }
2085}
2086
2087fn parent_id_for_file(
2088 path: &str,
2089 repo_id: BlockId,
2090 directory_ids: &BTreeMap<String, BlockId>,
2091) -> BlockId {
2092 let parent_dir = parent_directory(path);
2093 if parent_dir.is_empty() {
2094 repo_id
2095 } else {
2096 directory_ids.get(&parent_dir).copied().unwrap_or(repo_id)
2097 }
2098}
2099
/// The directory portion of a slash-separated path (`""` when there is
/// none).
fn parent_directory(path: &str) -> String {
    path.rsplit_once('/')
        .map(|(parent, _)| parent.to_string())
        .unwrap_or_default()
}
2106
/// Joins `relative` onto `base`, resolving `"."` and `".."` segments
/// textually. A `".."` at the root is dropped rather than escaping above it.
fn normalize_relative_join(base: &str, relative: &str) -> String {
    let mut segments: Vec<String> = base
        .split('/')
        .filter(|s| !s.is_empty())
        .map(str::to_string)
        .collect();

    for part in relative.split('/') {
        if part.is_empty() || part == "." {
            continue;
        }
        if part == ".." {
            segments.pop();
        } else {
            segments.push(part.to_string());
        }
    }

    segments.join("/")
}
2130
/// Removes the last `levels` components from a slash-separated path,
/// stopping at the root (`""`) when there is nothing left to remove.
fn ascend_directory(path: &str, levels: usize) -> String {
    let parts: Vec<&str> = path.split('/').filter(|s| !s.is_empty()).collect();
    let keep = parts.len().saturating_sub(levels);
    parts[..keep].join("/")
}
2141
/// Replaces every character outside `[A-Za-z0-9_-]` with an underscore so
/// the result is safe to embed in identifiers.
fn sanitize_identifier(raw: &str) -> String {
    let keep = |c: char| c.is_ascii_alphanumeric() || c == '-' || c == '_';
    raw.chars().map(|c| if keep(c) { c } else { '_' }).collect()
}
2153
/// Converts a filesystem path to a slash-separated string, dropping `"."`
/// and empty components, for platform-independent comparison.
fn normalize_path(path: &Path) -> String {
    let mut parts = Vec::new();
    for component in path.components() {
        let text = component.as_os_str().to_string_lossy();
        if text != "." && !text.is_empty() {
            parts.push(text.to_string());
        }
    }
    parts.join("/")
}
2167
/// True when any component of the slash-separated path starts with a dot
/// (a dotfile or dot-directory).
fn is_hidden_path(path: &str) -> bool {
    for part in path.split('/') {
        if part.starts_with('.') {
            return true;
        }
    }
    false
}
2171
/// Minimal `.gitignore` matcher: an ordered list of compiled ignore rules.
/// Negation (`!`) patterns are not supported — they are skipped at load
/// time by [`GitignoreMatcher::from_repository`].
#[derive(Debug, Clone)]
struct GitignoreMatcher {
    rules: Vec<GitignoreRule>,
}
2176
/// One `.gitignore` pattern compiled to a regex over repo-relative paths.
#[derive(Debug, Clone)]
struct GitignoreRule {
    // Compiled form of the glob pattern (see `glob_to_regex`).
    regex: Regex,
    // True for patterns ending in '/': they match directories only.
    directory_only: bool,
}
2182
2183impl GitignoreMatcher {
2184 fn from_repository(repo_root: &Path) -> Result<Self> {
2185 let gitignore_path = repo_root.join(".gitignore");
2186 if !gitignore_path.exists() {
2187 return Ok(Self { rules: Vec::new() });
2188 }
2189
2190 let raw = fs::read_to_string(&gitignore_path)
2191 .with_context(|| format!("failed to read {}", gitignore_path.display()))?;
2192
2193 let mut rules = Vec::new();
2194 for line in raw.lines() {
2195 let trimmed = line.trim();
2196 if trimmed.is_empty() || trimmed.starts_with('#') || trimmed.starts_with('!') {
2197 continue;
2198 }
2199
2200 if let Some(rule) = GitignoreRule::from_pattern(trimmed) {
2201 rules.push(rule);
2202 }
2203 }
2204
2205 Ok(Self { rules })
2206 }
2207
2208 fn is_ignored(&self, rel_path: &str, is_dir: bool) -> bool {
2209 for rule in &self.rules {
2210 if rule.directory_only && !is_dir {
2211 continue;
2212 }
2213 if rule.regex.is_match(rel_path) {
2214 return true;
2215 }
2216 }
2217 false
2218 }
2219}
2220
2221impl GitignoreRule {
2222 fn from_pattern(pattern: &str) -> Option<Self> {
2223 let directory_only = pattern.ends_with('/');
2224 let mut core = pattern.trim_end_matches('/').trim_start_matches("./");
2225
2226 if core.is_empty() {
2227 return None;
2228 }
2229
2230 let anchored = core.starts_with('/');
2231 core = core.trim_start_matches('/');
2232
2233 let mut regex = String::new();
2234 if anchored {
2235 regex.push('^');
2236 } else {
2237 regex.push_str("(^|.*/)");
2238 }
2239
2240 regex.push_str(&glob_to_regex(core));
2241
2242 if directory_only {
2243 regex.push_str("($|/.*)");
2244 } else {
2245 regex.push('$');
2246 }
2247
2248 let compiled = Regex::new(®ex).ok()?;
2249
2250 Some(Self {
2251 regex: compiled,
2252 directory_only,
2253 })
2254 }
2255}
2256
/// Translates a gitignore-style glob into regex syntax: `**` -> `.*`,
/// `*` -> `[^/]*`, `?` -> `[^/]`, with regex metacharacters escaped.
fn glob_to_regex(glob: &str) -> String {
    const META: &str = ".+()|^${}[]\\";
    let mut out = String::with_capacity(glob.len() * 2);
    let mut chars = glob.chars().peekable();

    while let Some(ch) = chars.next() {
        if ch == '*' {
            if chars.peek() == Some(&'*') {
                chars.next();
                out.push_str(".*");
            } else {
                out.push_str("[^/]*");
            }
        } else if ch == '?' {
            out.push_str("[^/]");
        } else if META.contains(ch) {
            out.push('\\');
            out.push(ch);
        } else {
            out.push(ch);
        }
    }

    out
}
2282
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;
    use tempfile::tempdir;

    // An empty document lacks both profile markers; validation must fail
    // with at least one of the marker diagnostics.
    #[test]
    fn test_validate_profile_detects_missing_markers() {
        let doc = Document::create();
        let result = validate_code_graph_profile(&doc);
        assert!(!result.valid);
        assert!(result
            .diagnostics
            .iter()
            .any(|d| d.code == "CG1001" || d.code == "CG1002"));
    }

    // Building the same repository twice must yield byte-identical
    // canonical JSON and identical fingerprints (determinism guarantee).
    #[test]
    fn test_canonical_fingerprint_stable_for_equivalent_docs() {
        let dir = tempdir().unwrap();
        let root = dir.path();
        fs::create_dir_all(root.join("src")).unwrap();
        fs::write(root.join("src/lib.rs"), "pub fn a() {}\n").unwrap();

        let input = CodeGraphBuildInput {
            repository_path: root.to_path_buf(),
            commit_hash: "abc123".to_string(),
            config: CodeGraphExtractorConfig::default(),
        };

        let first = build_code_graph(&input).unwrap();
        let second = build_code_graph(&input).unwrap();

        assert_eq!(first.canonical_fingerprint, second.canonical_fingerprint);
        assert_eq!(
            canonical_codegraph_json(&first.document).unwrap(),
            canonical_codegraph_json(&second.document).unwrap()
        );
    }

    // Serializing a document through PortableDocument and back must not
    // change the canonical fingerprint.
    #[test]
    fn test_portable_document_roundtrip_preserves_fingerprint() {
        let dir = tempdir().unwrap();
        fs::create_dir_all(dir.path().join("pkg")).unwrap();
        fs::write(
            dir.path().join("pkg/main.py"),
            "from .util import helper\n\ndef run():\n    return helper()\n",
        )
        .unwrap();
        fs::write(
            dir.path().join("pkg/util.py"),
            "def helper():\n    return 1\n",
        )
        .unwrap();

        let build = build_code_graph(&CodeGraphBuildInput {
            repository_path: dir.path().to_path_buf(),
            commit_hash: "def456".to_string(),
            config: CodeGraphExtractorConfig::default(),
        })
        .unwrap();

        let portable = PortableDocument::from_document(&build.document);
        let json = serde_json::to_string_pretty(&portable).unwrap();
        let decoded: PortableDocument = serde_json::from_str(&json).unwrap();
        let roundtripped = decoded.to_document().unwrap();

        let fp1 = canonical_fingerprint(&build.document).unwrap();
        let fp2 = canonical_fingerprint(&roundtripped).unwrap();
        assert_eq!(fp1, fp2);
    }

    // An import of a module that does not exist in the repository must be
    // reported as a CG2006 warning, not an error.
    #[test]
    fn test_unresolved_import_produces_diagnostic() {
        let dir = tempdir().unwrap();
        fs::create_dir_all(dir.path().join("src")).unwrap();
        fs::write(
            dir.path().join("src/lib.rs"),
            "use crate::missing::thing;\npub fn keep() {}\n",
        )
        .unwrap();

        let build = build_code_graph(&CodeGraphBuildInput {
            repository_path: dir.path().to_path_buf(),
            commit_hash: "ghi789".to_string(),
            config: CodeGraphExtractorConfig::default(),
        })
        .unwrap();

        assert!(build
            .diagnostics
            .iter()
            .any(|d| d.code == "CG2006" && d.severity == CodeGraphSeverity::Warning));
    }

    // A directory-only pattern must match both the directory itself and
    // anything nested beneath it.
    #[test]
    fn test_gitignore_rule_matches() {
        let rule = GitignoreRule::from_pattern("target/").unwrap();
        assert!(rule.regex.is_match("target"));
        assert!(rule.regex.is_match("target/debug/app"));
    }

    // A relative TS specifier without an extension must resolve against
    // the sibling file via the candidate-extension probing.
    #[test]
    fn test_import_resolution_ts_relative() {
        let mut known = BTreeSet::new();
        known.insert("src/main.ts".to_string());
        known.insert("src/util.ts".to_string());

        let resolved = resolve_ts_import("src/main.ts", "./util", &known);
        assert_eq!(resolved.as_deref(), Some("src/util.ts"));
    }

    // Smoke test: 300 chained modules must build within a coarse wall-time
    // budget, guarding against accidental quadratic behavior.
    #[test]
    fn test_performance_smoke_medium_fixture() {
        let dir = tempdir().unwrap();
        let src = dir.path().join("src");
        fs::create_dir_all(&src).unwrap();

        for i in 0..300usize {
            let mut file = fs::File::create(src.join(format!("m{}.rs", i))).unwrap();
            writeln!(file, "pub fn f{}() {{}}", i).unwrap();
            if i > 0 {
                writeln!(file, "use crate::m{}::f{};", i - 1, i - 1).unwrap();
            }
        }

        let start = std::time::Instant::now();
        let build = build_code_graph(&CodeGraphBuildInput {
            repository_path: dir.path().to_path_buf(),
            commit_hash: "perf-smoke".to_string(),
            config: CodeGraphExtractorConfig::default(),
        })
        .unwrap();
        let elapsed = start.elapsed();

        assert!(build.stats.file_nodes >= 300);
        assert!(elapsed.as_secs_f64() < 3.0, "elapsed: {elapsed:?}");
    }
}