// ucp_api/codegraph.rs — CodeGraphProfile v1 extraction, validation, and canonicalization.
1use anyhow::{anyhow, Context, Result};
2use regex::Regex;
3use serde::{Deserialize, Serialize};
4use serde_json::json;
5use sha2::{Digest, Sha256};
6use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet};
7use std::fs;
8use std::path::{Path, PathBuf};
9use std::str::FromStr;
10use tree_sitter::{Language, Node, Parser};
11use ucm_core::{
12    normalize::{canonical_json, normalize_content},
13    Block, BlockId, Content, Document, DocumentId, DocumentMetadata, Edge, EdgeType,
14};
15use ucp_llm::IdMapper;
16
/// Profile name stored in `document.metadata.custom["profile"]`.
pub const CODEGRAPH_PROFILE: &str = "codegraph";
/// Profile version stored in `document.metadata.custom["profile_version"]`.
pub const CODEGRAPH_PROFILE_VERSION: &str = "v1";
/// Combined marker; also reported as `CodeGraphBuildResult::profile_version`.
pub const CODEGRAPH_PROFILE_MARKER: &str = "codegraph.v1";
/// Identifies the extractor build that produced a document.
pub const CODEGRAPH_EXTRACTOR_VERSION: &str = "ucp-codegraph-extractor.v1";

// Keys used in per-block `metadata.custom` maps by this profile.
const META_NODE_CLASS: &str = "node_class"; // "repository" | "directory" | "file" | "symbol"
const META_LOGICAL_KEY: &str = "logical_key"; // stable, globally unique node identity
const META_PATH: &str = "path"; // repository-relative path (directory/file blocks)
const META_LANGUAGE: &str = "language"; // e.g. "rust", "python" (file blocks)
const META_SYMBOL_KIND: &str = "symbol_kind"; // presumably set on symbol blocks — writer not in this view
const META_SYMBOL_NAME: &str = "name";
const META_SPAN: &str = "span"; // {start_line, start_col, end_line, end_col}
const META_EXPORTED: &str = "exported"; // bool: whether the symbol is exported
30
/// Severity of a [`CodeGraphDiagnostic`]; serialized in `snake_case`
/// (`"error"`, `"warning"`, `"info"`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum CodeGraphSeverity {
    /// Hard failure; CG100x errors flip the build status to `FailedValidation`.
    Error,
    /// Non-fatal issue; any warning downgrades a build to `PartialSuccess`.
    Warning,
    /// Informational only; does not affect build status.
    Info,
}
38
/// One validation or extraction finding, optionally pinned to a file path
/// and/or a node's logical key.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct CodeGraphDiagnostic {
    pub severity: CodeGraphSeverity,
    /// Stable diagnostic code, e.g. "CG1001" (profile checks) or "CG2003" (I/O).
    pub code: String,
    pub message: String,
    /// Repository-relative path of the offending file, when known.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub path: Option<String>,
    /// Logical key of the offending node, when known.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub logical_key: Option<String>,
}
49
50impl CodeGraphDiagnostic {
51    fn error(code: &str, message: impl Into<String>) -> Self {
52        Self {
53            severity: CodeGraphSeverity::Error,
54            code: code.to_string(),
55            message: message.into(),
56            path: None,
57            logical_key: None,
58        }
59    }
60
61    fn warning(code: &str, message: impl Into<String>) -> Self {
62        Self {
63            severity: CodeGraphSeverity::Warning,
64            code: code.to_string(),
65            message: message.into(),
66            path: None,
67            logical_key: None,
68        }
69    }
70
71    fn info(code: &str, message: impl Into<String>) -> Self {
72        Self {
73            severity: CodeGraphSeverity::Info,
74            code: code.to_string(),
75            message: message.into(),
76            path: None,
77            logical_key: None,
78        }
79    }
80
81    fn with_path(mut self, path: impl Into<String>) -> Self {
82        self.path = Some(path.into());
83        self
84    }
85
86    fn with_logical_key(mut self, logical_key: impl Into<String>) -> Self {
87        self.logical_key = Some(logical_key.into());
88        self
89    }
90}
91
/// Outcome of [`validate_code_graph_profile`]: `valid` is true iff no
/// diagnostic has `Error` severity.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Default)]
pub struct CodeGraphValidationResult {
    pub valid: bool,
    pub diagnostics: Vec<CodeGraphDiagnostic>,
}
97
/// Aggregate node/edge counts for a built CodeGraph document.
/// NOTE(review): populated by `compute_stats`, which is outside this view —
/// field semantics inferred from names; confirm against that function.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Default)]
pub struct CodeGraphStats {
    pub total_nodes: usize,
    pub repository_nodes: usize,
    pub directory_nodes: usize,
    pub file_nodes: usize,
    pub symbol_nodes: usize,
    pub total_edges: usize,
    pub reference_edges: usize,
    pub export_edges: usize,
    /// Counts keyed by language name; omitted from JSON when empty.
    #[serde(default, skip_serializing_if = "BTreeMap::is_empty")]
    pub languages: BTreeMap<String, usize>,
}
111
/// Overall outcome of [`build_code_graph`], serialized in `snake_case`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum CodeGraphBuildStatus {
    /// Only `Info` diagnostics (or none) were produced.
    Success,
    /// Warnings or non-profile errors occurred, but the profile validated.
    PartialSuccess,
    /// At least one CG100x profile-validation error occurred.
    FailedValidation,
}
119
/// Everything produced by a [`build_code_graph`] run.
#[derive(Debug, Clone)]
pub struct CodeGraphBuildResult {
    /// The fully built and index-rebuilt CodeGraph document.
    pub document: Document,
    /// Extraction plus validation diagnostics, in emission order.
    pub diagnostics: Vec<CodeGraphDiagnostic>,
    pub stats: CodeGraphStats,
    /// Always [`CODEGRAPH_PROFILE_MARKER`] ("codegraph.v1").
    pub profile_version: String,
    /// Hex SHA-256 of the canonical JSON projection of the document.
    pub canonical_fingerprint: String,
    pub status: CodeGraphBuildStatus,
}
129
130impl CodeGraphBuildResult {
131    pub fn has_errors(&self) -> bool {
132        self.diagnostics
133            .iter()
134            .any(|d| d.severity == CodeGraphSeverity::Error)
135    }
136}
137
/// Input to [`build_code_graph`]: where the checkout lives, which commit it
/// represents, and extraction tuning.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CodeGraphBuildInput {
    pub repository_path: PathBuf,
    /// Commit identifier; embedded in the document id and metadata as-is.
    pub commit_hash: String,
    #[serde(default)]
    pub config: CodeGraphExtractorConfig,
}
145
/// Extraction knobs; every field has a serde default so a missing/empty
/// config deserializes to the same values as `Default::default()`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CodeGraphExtractorConfig {
    /// File extensions to extract (default: rs, py, ts, tsx, js, jsx).
    #[serde(default = "default_include_extensions")]
    pub include_extensions: Vec<String>,
    /// Directory names skipped during the walk (default: .git, target, ...).
    #[serde(default = "default_exclude_dirs")]
    pub exclude_dirs: Vec<String>,
    /// When true (default), unreadable/unparseable files become diagnostics
    /// instead of aborting the build.
    #[serde(default = "default_continue_on_parse_error")]
    pub continue_on_parse_error: bool,
    /// Include hidden files/dirs (default: false).
    #[serde(default)]
    pub include_hidden: bool,
    /// Files larger than this are skipped with a CG2008 warning (default 2 MiB).
    #[serde(default = "default_max_file_bytes")]
    pub max_file_bytes: usize,
    /// Emit file -> symbol "exports" edges for exported symbols (default: true).
    #[serde(default = "default_emit_export_edges")]
    pub emit_export_edges: bool,
}
161
162impl Default for CodeGraphExtractorConfig {
163    fn default() -> Self {
164        Self {
165            include_extensions: default_include_extensions(),
166            exclude_dirs: default_exclude_dirs(),
167            continue_on_parse_error: default_continue_on_parse_error(),
168            include_hidden: false,
169            max_file_bytes: default_max_file_bytes(),
170            emit_export_edges: default_emit_export_edges(),
171        }
172    }
173}
174
/// Serde default: source-file extensions the extractor processes.
fn default_include_extensions() -> Vec<String> {
    ["rs", "py", "ts", "tsx", "js", "jsx"]
        .into_iter()
        .map(String::from)
        .collect()
}
181
/// Serde default: directory names skipped during the repository walk.
fn default_exclude_dirs() -> Vec<String> {
    [".git", "target", "node_modules", "dist", "build"]
        .into_iter()
        .map(String::from)
        .collect()
}
188
/// Serde default: keep walking the repository after a per-file failure.
fn default_continue_on_parse_error() -> bool {
    true
}
192
/// Serde default: per-file size cap, 2 MiB.
fn default_max_file_bytes() -> usize {
    2 * 1024 * 1024
}
196
/// Serde default: emit file -> symbol "exports" edges.
fn default_emit_export_edges() -> bool {
    true
}
200
/// Serialization-friendly mirror of [`Document`]: all ids are strings and all
/// maps are `BTreeMap`s so the serialized form is deterministic.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PortableDocument {
    pub id: String,
    /// Stringified root [`BlockId`].
    pub root: String,
    /// Parent id -> sorted child ids.
    pub structure: BTreeMap<String, Vec<String>>,
    pub blocks: BTreeMap<String, Block>,
    pub metadata: DocumentMetadata,
    /// `DocumentVersion::counter` only; timestamp/state_hash are not persisted.
    pub version: u64,
}
210
211impl PortableDocument {
212    pub fn from_document(doc: &Document) -> Self {
213        let mut structure = BTreeMap::new();
214        for (parent, children) in &doc.structure {
215            let mut sorted = children.clone();
216            sorted.sort_by_key(|id| id.to_string());
217            structure.insert(
218                parent.to_string(),
219                sorted.into_iter().map(|id| id.to_string()).collect(),
220            );
221        }
222
223        let mut blocks = BTreeMap::new();
224        for (id, block) in &doc.blocks {
225            blocks.insert(id.to_string(), block.clone());
226        }
227
228        Self {
229            id: doc.id.0.clone(),
230            root: doc.root.to_string(),
231            structure,
232            blocks,
233            metadata: doc.metadata.clone(),
234            version: doc.version.counter,
235        }
236    }
237
238    pub fn to_document(&self) -> Result<Document> {
239        let root = BlockId::from_str(&self.root)
240            .map_err(|_| anyhow!("invalid root block id: {}", self.root))?;
241
242        let mut structure: HashMap<BlockId, Vec<BlockId>> = HashMap::new();
243        for (parent, children) in &self.structure {
244            let parent_id = BlockId::from_str(parent)
245                .map_err(|_| anyhow!("invalid structure parent id: {}", parent))?;
246            let mut parsed_children = Vec::with_capacity(children.len());
247            for child in children {
248                let child_id = BlockId::from_str(child)
249                    .map_err(|_| anyhow!("invalid structure child id: {}", child))?;
250                parsed_children.push(child_id);
251            }
252            structure.insert(parent_id, parsed_children);
253        }
254
255        let mut blocks: HashMap<BlockId, Block> = HashMap::new();
256        for (id, block) in &self.blocks {
257            let block_id = BlockId::from_str(id)
258                .map_err(|_| anyhow!("invalid block id in blocks map: {}", id))?;
259            blocks.insert(block_id, block.clone());
260        }
261
262        let mut doc = Document {
263            id: DocumentId::new(self.id.clone()),
264            root,
265            structure,
266            blocks,
267            metadata: self.metadata.clone(),
268            indices: Default::default(),
269            edge_index: Default::default(),
270            version: ucm_core::DocumentVersion {
271                counter: self.version,
272                timestamp: deterministic_timestamp(),
273                state_hash: [0u8; 8],
274            },
275        };
276        doc.rebuild_indices();
277        Ok(doc)
278    }
279}
280
/// Builds a CodeGraphProfile v1 [`Document`] from a repository checkout.
///
/// Pipeline: walk the repo -> create repository/directory/file/symbol blocks
/// -> emit `exports` (file -> symbol) and `references` (file -> file) edges
/// -> normalize ordering -> validate profile -> fingerprint. File-level
/// failures become diagnostics (and the file is skipped) when
/// `config.continue_on_parse_error` allows; otherwise they abort the build.
///
/// # Errors
/// Fails on an unresolvable/non-directory repo path, on block insertion
/// failures, or on read errors when `continue_on_parse_error` is false.
pub fn build_code_graph(input: &CodeGraphBuildInput) -> Result<CodeGraphBuildResult> {
    let repo_root = input
        .repository_path
        .canonicalize()
        .with_context(|| format!("failed to resolve repo path {:?}", input.repository_path))?;

    if !repo_root.is_dir() {
        return Err(anyhow!(
            "repository path is not a directory: {}",
            repo_root.display()
        ));
    }

    let mut diagnostics = Vec::new();
    let matcher = GitignoreMatcher::from_repository(&repo_root)?;
    let files = collect_repository_files(&repo_root, &input.config, &matcher, &mut diagnostics)?;

    let repo_name = repo_root
        .file_name()
        .map(|s| s.to_string_lossy().to_string())
        .unwrap_or_else(|| "repository".to_string());

    // Document id derives from repo name + commit so rebuilds are stable.
    let mut doc = Document::new(DocumentId::new(format!(
        "codegraph:{}:{}",
        sanitize_identifier(&repo_name),
        sanitize_identifier(&input.commit_hash)
    )));

    initialize_document_metadata(&mut doc, &repo_root, &repo_name, &input.commit_hash);

    let repo_block = make_repository_block(&repo_name, &input.commit_hash);
    let root_id = doc.root;
    let repo_block_id = doc.add_block(repo_block, &root_id)?;

    // Collect every ancestor directory of every file. BTreeSet iteration is
    // sorted, so parents are always created before their children below.
    let mut directories = BTreeSet::new();
    for file in &files {
        for dir in ancestor_directories(&file.relative_path) {
            directories.insert(dir);
        }
    }

    let mut directory_ids: BTreeMap<String, BlockId> = BTreeMap::new();
    for dir in directories {
        // Top-level directories hang directly off the repository block.
        let parent_id = parent_directory_id(&dir, &directory_ids).unwrap_or(repo_block_id);
        let block = make_directory_block(&dir);
        let block_id = doc.add_block(block, &parent_id)?;
        directory_ids.insert(dir, block_id);
    }

    let mut file_ids: BTreeMap<String, BlockId> = BTreeMap::new();
    let mut file_analyses = Vec::new();
    // Guards against logical_key collisions across symbols (CG1013).
    let mut used_symbol_keys: HashSet<String> = HashSet::new();

    for file in files {
        let parent_id = parent_id_for_file(&file.relative_path, repo_block_id, &directory_ids);

        let source = match fs::read_to_string(&file.absolute_path) {
            Ok(s) => s,
            Err(err) => {
                // CG2003: unreadable file — skip or abort per config.
                let diag = CodeGraphDiagnostic::error(
                    "CG2003",
                    format!("failed to read source file: {}", err),
                )
                .with_path(file.relative_path.clone());
                diagnostics.push(diag);
                if input.config.continue_on_parse_error {
                    continue;
                }
                return Err(anyhow!(
                    "failed to read source file {}: {}",
                    file.relative_path,
                    err
                ));
            }
        };

        // CG2008: oversized files are always skipped (never fatal).
        if source.len() > input.config.max_file_bytes {
            diagnostics.push(
                CodeGraphDiagnostic::warning(
                    "CG2008",
                    format!(
                        "file skipped due to size limit ({} bytes > {} bytes)",
                        source.len(),
                        input.config.max_file_bytes
                    ),
                )
                .with_path(file.relative_path.clone()),
            );
            continue;
        }

        let file_block = make_file_block(&file.relative_path, file.language.as_str());
        let file_block_id = doc.add_block(file_block, &parent_id)?;
        file_ids.insert(file.relative_path.clone(), file_block_id);

        let analysis = analyze_file(&file.relative_path, &source, file.language);
        // Analyzer diagnostics carry no path; stamp it on here.
        for diag in &analysis.diagnostics {
            diagnostics.push(diag.clone().with_path(file.relative_path.clone()));
        }

        for symbol in &analysis.symbols {
            let logical_key = unique_symbol_logical_key(
                &file.relative_path,
                &symbol.name,
                symbol.start_line,
                &mut used_symbol_keys,
            );
            let symbol_block = make_symbol_block(
                &logical_key,
                &file.relative_path,
                file.language.as_str(),
                symbol,
            );
            let symbol_id = doc.add_block(symbol_block, &file_block_id)?;

            // Optional file -> symbol "exports" edge for exported symbols.
            if symbol.exported && input.config.emit_export_edges {
                let mut edge = Edge::new(EdgeType::Custom("exports".to_string()), symbol_id);
                edge.metadata
                    .custom
                    .insert("relation".to_string(), json!("exports"));
                edge.metadata
                    .custom
                    .insert("symbol".to_string(), json!(symbol.name.clone()));
                if let Some(source_block) = doc.get_block_mut(&file_block_id) {
                    source_block.edges.push(edge);
                }
            }
        }

        // Imports are resolved in a second pass once all files are known.
        file_analyses.push(FileAnalysisRecord {
            file: file.relative_path,
            language: file.language,
            imports: analysis.imports,
        });
    }

    let known_files: BTreeSet<String> = file_ids.keys().cloned().collect();
    // BTreeSet dedupes repeated imports and keeps edge emission order stable.
    let mut pending_reference_edges: BTreeSet<(String, String, String)> = BTreeSet::new();

    for record in &file_analyses {
        for import in &record.imports {
            match resolve_import(&record.file, &record.language, &import.module, &known_files) {
                Some(target) if target != record.file => {
                    pending_reference_edges.insert((
                        record.file.clone(),
                        target,
                        import.module.clone(),
                    ));
                }
                // Self-imports are silently dropped.
                Some(_) => {}
                None => {
                    // CG2006: import does not map to a known repo file.
                    diagnostics.push(
                        CodeGraphDiagnostic::warning(
                            "CG2006",
                            format!("unresolved import '{}'", import.module),
                        )
                        .with_path(record.file.clone()),
                    );
                }
            }
        }
    }

    for (source_path, target_path, raw_import) in pending_reference_edges {
        let (Some(source_id), Some(target_id)) =
            (file_ids.get(&source_path), file_ids.get(&target_path))
        else {
            continue;
        };
        let mut edge = Edge::new(EdgeType::References, *target_id);
        edge.metadata
            .custom
            .insert("relation".to_string(), json!("imports"));
        edge.metadata
            .custom
            .insert("raw_import".to_string(), json!(raw_import));
        if let Some(source_block) = doc.get_block_mut(source_id) {
            source_block.edges.push(edge);
        }
    }

    // Determinism pass: canonical child order, edge order, and timestamps.
    sort_structure_children_by_logical_key(&mut doc);
    sort_edges(&mut doc);
    normalize_temporal_fields(&mut doc);
    doc.rebuild_indices();

    let mut validation = validate_code_graph_profile(&doc);
    diagnostics.append(&mut validation.diagnostics);

    let fingerprint = canonical_fingerprint(&doc)?;
    let stats = compute_stats(&doc);

    // Status: CG100x errors fail validation; any other warning/error is
    // partial success; info-only (or clean) is full success.
    let has_profile_errors = diagnostics
        .iter()
        .any(|d| d.severity == CodeGraphSeverity::Error && d.code.starts_with("CG100"));
    let has_non_info = diagnostics
        .iter()
        .any(|d| d.severity != CodeGraphSeverity::Info);

    let status = if has_profile_errors {
        CodeGraphBuildStatus::FailedValidation
    } else if has_non_info {
        CodeGraphBuildStatus::PartialSuccess
    } else {
        CodeGraphBuildStatus::Success
    };

    Ok(CodeGraphBuildResult {
        document: doc,
        diagnostics,
        stats,
        profile_version: CODEGRAPH_PROFILE_MARKER.to_string(),
        canonical_fingerprint: fingerprint,
        status,
    })
}
497
/// Validates that `doc` conforms to CodeGraphProfile v1.
///
/// Checks, in order: profile markers (CG1001/CG1002), per-block `node_class`
/// (CG1010) and `logical_key` (CG1011) metadata, presence of each node class
/// (CG1012, warning), logical-key uniqueness (CG1013), and edge targets plus
/// endpoint-class constraints (CG1014-CG1016). `valid` is true iff no
/// diagnostic has `Error` severity.
pub fn validate_code_graph_profile(doc: &Document) -> CodeGraphValidationResult {
    let mut diagnostics = Vec::new();

    // CG1001: the document must carry the exact profile marker.
    match doc.metadata.custom.get("profile").and_then(|v| v.as_str()) {
        Some(CODEGRAPH_PROFILE) => {}
        Some(other) => diagnostics.push(CodeGraphDiagnostic::error(
            "CG1001",
            format!(
                "invalid profile marker '{}', expected '{}'",
                other, CODEGRAPH_PROFILE
            ),
        )),
        None => diagnostics.push(CodeGraphDiagnostic::error(
            "CG1001",
            "missing document metadata.custom.profile marker",
        )),
    }

    // CG1002: profile_version must match exactly.
    match doc
        .metadata
        .custom
        .get("profile_version")
        .and_then(|v| v.as_str())
    {
        Some(CODEGRAPH_PROFILE_VERSION) => {}
        Some(other) => diagnostics.push(CodeGraphDiagnostic::error(
            "CG1002",
            format!(
                "invalid profile version '{}', expected '{}'",
                other, CODEGRAPH_PROFILE_VERSION
            ),
        )),
        None => diagnostics.push(CodeGraphDiagnostic::error(
            "CG1002",
            "missing document metadata.custom.profile_version marker",
        )),
    }

    let mut logical_keys: HashMap<String, Vec<BlockId>> = HashMap::new();
    let mut class_counts: HashMap<String, usize> = HashMap::new();

    // Per-block checks; the root block is exempt from profile metadata.
    for (id, block) in &doc.blocks {
        if *id == doc.root {
            continue;
        }

        // CG1010: every non-root block needs a node_class.
        let class = node_class(block);
        let Some(class_name) = class else {
            diagnostics.push(
                CodeGraphDiagnostic::error(
                    "CG1010",
                    "block missing node_class metadata (or custom semantic role)",
                )
                .with_path(block_path(block).unwrap_or_else(|| id.to_string())),
            );
            continue;
        };

        *class_counts.entry(class_name.clone()).or_default() += 1;

        // CG1011: every classified block needs a logical_key; collect keys
        // so duplicates can be reported below.
        match block_logical_key(block) {
            Some(logical_key) => {
                logical_keys.entry(logical_key).or_default().push(*id);
            }
            None => diagnostics.push(
                CodeGraphDiagnostic::error("CG1011", "missing required logical_key metadata")
                    .with_path(block_path(block).unwrap_or_else(|| id.to_string())),
            ),
        }

        validate_required_metadata(&class_name, block, &mut diagnostics);
    }

    // CG1012 (warning): each node class should occur at least once.
    for class in ["repository", "directory", "file", "symbol"] {
        if class_counts.get(class).copied().unwrap_or(0) == 0 {
            diagnostics.push(CodeGraphDiagnostic::warning(
                "CG1012",
                format!("profile has no '{}' nodes", class),
            ));
        }
    }

    // CG1013: logical keys must be globally unique.
    for (logical_key, ids) in logical_keys {
        if ids.len() > 1 {
            diagnostics.push(
                CodeGraphDiagnostic::error(
                    "CG1013",
                    format!(
                        "logical_key '{}' is duplicated by {} blocks",
                        logical_key,
                        ids.len()
                    ),
                )
                .with_logical_key(logical_key),
            );
        }
    }

    let logical_by_id = logical_key_index(doc);

    // Edge checks: target must exist (CG1014); endpoint classes must match
    // the edge type ("references": file->file CG1015; "exports": file->symbol CG1016).
    for (source_id, block) in &doc.blocks {
        let Some(source_class) = node_class(block) else {
            continue;
        };
        for edge in &block.edges {
            let target_block = match doc.get_block(&edge.target) {
                Some(b) => b,
                None => {
                    diagnostics.push(
                        CodeGraphDiagnostic::error(
                            "CG1014",
                            format!("edge references missing target block {}", edge.target),
                        )
                        .with_logical_key(
                            logical_by_id
                                .get(source_id)
                                .cloned()
                                .unwrap_or_else(|| source_id.to_string()),
                        ),
                    );
                    continue;
                }
            };

            let target_class = node_class(target_block).unwrap_or_default();

            match &edge.edge_type {
                EdgeType::References => {
                    if source_class != "file" || target_class != "file" {
                        diagnostics.push(
                            CodeGraphDiagnostic::error(
                                "CG1015",
                                "references edges must connect file -> file",
                            )
                            .with_logical_key(
                                logical_by_id
                                    .get(source_id)
                                    .cloned()
                                    .unwrap_or_else(|| source_id.to_string()),
                            ),
                        );
                    }
                }
                EdgeType::Custom(name) if name == "exports" => {
                    if source_class != "file" || target_class != "symbol" {
                        diagnostics.push(
                            CodeGraphDiagnostic::error(
                                "CG1016",
                                "exports edges must connect file -> symbol",
                            )
                            .with_logical_key(
                                logical_by_id
                                    .get(source_id)
                                    .cloned()
                                    .unwrap_or_else(|| source_id.to_string()),
                            ),
                        );
                    }
                }
                // Other edge types are not constrained by this profile.
                _ => {}
            }
        }
    }

    CodeGraphValidationResult {
        valid: diagnostics
            .iter()
            .all(|d| d.severity != CodeGraphSeverity::Error),
        diagnostics,
    }
}
669
/// Projects `doc` into a canonical, deterministic JSON string.
///
/// Blocks, structure entries, and edges are keyed by logical_key (falling
/// back to stringified block id) and sorted, so two semantically identical
/// documents produce byte-identical output regardless of internal id values.
/// This string is what [`canonical_fingerprint`] hashes.
pub fn canonical_codegraph_json(doc: &Document) -> Result<String> {
    let logical_by_id = logical_key_index(doc);

    // Nodes: one entry per non-root block, sorted by logical_key.
    let mut node_entries = Vec::new();
    for (id, block) in &doc.blocks {
        if *id == doc.root {
            continue;
        }

        let logical_key = logical_by_id
            .get(id)
            .cloned()
            .unwrap_or_else(|| id.to_string());

        let class = node_class(block).unwrap_or_else(|| "unknown".to_string());
        let metadata = normalized_block_metadata(block);

        node_entries.push(json!({
            "logical_key": logical_key,
            "node_class": class,
            "semantic_role": block.metadata.semantic_role.as_ref().map(|r| r.to_string()),
            "content_type": block.content.type_tag(),
            "content": normalize_content(&block.content),
            "metadata": metadata,
        }));
    }

    node_entries.sort_by(|a, b| {
        let ak = a
            .get("logical_key")
            .and_then(|v| v.as_str())
            .unwrap_or_default();
        let bk = b
            .get("logical_key")
            .and_then(|v| v.as_str())
            .unwrap_or_default();
        ak.cmp(bk)
    });

    // Structure: parent -> sorted child logical keys, sorted by parent key.
    let mut structure_entries = Vec::new();
    for (parent, children) in &doc.structure {
        let parent_key = logical_by_id
            .get(parent)
            .cloned()
            .unwrap_or_else(|| parent.to_string());

        let mut child_keys: Vec<String> = children
            .iter()
            .map(|child| {
                logical_by_id
                    .get(child)
                    .cloned()
                    .unwrap_or_else(|| child.to_string())
            })
            .collect();
        child_keys.sort();

        structure_entries.push(json!({
            "parent": parent_key,
            "children": child_keys,
        }));
    }

    structure_entries.sort_by(|a, b| {
        let ak = a.get("parent").and_then(|v| v.as_str()).unwrap_or_default();
        let bk = b.get("parent").and_then(|v| v.as_str()).unwrap_or_default();
        ak.cmp(bk)
    });

    // Edges: flattened from all blocks, sorted by (source, edge_type, target).
    let mut edge_entries = Vec::new();
    for (source_id, block) in &doc.blocks {
        let source_key = logical_by_id
            .get(source_id)
            .cloned()
            .unwrap_or_else(|| source_id.to_string());

        for edge in &block.edges {
            let target_key = logical_by_id
                .get(&edge.target)
                .cloned()
                .unwrap_or_else(|| edge.target.to_string());
            edge_entries.push(json!({
                "source": source_key,
                "edge_type": edge.edge_type.as_str(),
                "target": target_key,
                "metadata": normalized_edge_metadata(edge),
            }));
        }
    }

    edge_entries.sort_by(|a, b| {
        let a_source = a.get("source").and_then(|v| v.as_str()).unwrap_or_default();
        let b_source = b.get("source").and_then(|v| v.as_str()).unwrap_or_default();
        a_source
            .cmp(b_source)
            .then_with(|| {
                a.get("edge_type")
                    .and_then(|v| v.as_str())
                    .unwrap_or_default()
                    .cmp(
                        b.get("edge_type")
                            .and_then(|v| v.as_str())
                            .unwrap_or_default(),
                    )
            })
            .then_with(|| {
                a.get("target")
                    .and_then(|v| v.as_str())
                    .unwrap_or_default()
                    .cmp(b.get("target").and_then(|v| v.as_str()).unwrap_or_default())
            })
    });

    let canonical = json!({
        "profile": CODEGRAPH_PROFILE,
        "profile_version": CODEGRAPH_PROFILE_VERSION,
        "nodes": node_entries,
        "structure": structure_entries,
        "edges": edge_entries,
        "document_metadata": normalized_document_metadata(doc),
    });

    Ok(canonical_json(&canonical))
}
794
795pub fn canonical_fingerprint(doc: &Document) -> Result<String> {
796    let canonical = canonical_codegraph_json(doc)?;
797    let mut hasher = Sha256::new();
798    hasher.update(canonical.as_bytes());
799    let digest = hasher.finalize();
800    Ok(hex::encode(digest))
801}
802
/// Renders `doc` into its LLM prompt projection via [`IdMapper`].
pub fn codegraph_prompt_projection(doc: &Document) -> String {
    let mapper = IdMapper::from_document(doc);
    mapper.document_to_prompt(doc)
}
807
/// Source languages the extractor recognizes (see default_include_extensions:
/// .rs / .py / .ts+.tsx / .js+.jsx).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum CodeLanguage {
    Rust,
    Python,
    TypeScript,
    JavaScript,
}

impl CodeLanguage {
    /// Canonical lowercase name stored in file-block metadata.
    fn as_str(self) -> &'static str {
        match self {
            CodeLanguage::JavaScript => "javascript",
            CodeLanguage::TypeScript => "typescript",
            CodeLanguage::Python => "python",
            CodeLanguage::Rust => "rust",
        }
    }
}
826
/// One source file discovered during the repository walk.
#[derive(Debug, Clone)]
struct RepoFile {
    absolute_path: PathBuf,
    /// Path relative to the repository root; used as the file's identity.
    relative_path: String,
    language: CodeLanguage,
}
833
/// A symbol found by `analyze_file`, with its source span.
/// Line/col origin (0- or 1-based) is set by the extractor, which is outside
/// this view — confirm before relying on it.
#[derive(Debug, Clone)]
struct ExtractedSymbol {
    name: String,
    /// Symbol kind string (writer not in this view).
    kind: String,
    /// Whether the symbol is exported; gates "exports" edge emission.
    exported: bool,
    start_line: usize,
    start_col: usize,
    end_line: usize,
    end_col: usize,
}
844
/// Raw import/module string found in a file; resolved against known repo
/// files in a second pass (`resolve_import`).
#[derive(Debug, Clone)]
struct ExtractedImport {
    module: String,
}
849
/// Everything `analyze_file` extracts from one source file.
#[derive(Debug, Clone, Default)]
struct FileAnalysis {
    symbols: Vec<ExtractedSymbol>,
    imports: Vec<ExtractedImport>,
    /// Analyzer diagnostics; the caller stamps the file path onto each.
    diagnostics: Vec<CodeGraphDiagnostic>,
}
856
/// Per-file record retained for the deferred import-resolution pass.
#[derive(Debug, Clone)]
struct FileAnalysisRecord {
    /// Repository-relative path.
    file: String,
    language: CodeLanguage,
    imports: Vec<ExtractedImport>,
}
863
864fn initialize_document_metadata(
865    doc: &mut Document,
866    repo_root: &Path,
867    repo_name: &str,
868    commit: &str,
869) {
870    doc.metadata.title = Some(format!("CodeGraph: {}", repo_name));
871    doc.metadata.description = Some("CodeGraphProfile v1 document".to_string());
872    doc.metadata.language = Some("multi".to_string());
873    doc.metadata
874        .custom
875        .insert("profile".to_string(), json!(CODEGRAPH_PROFILE));
876    doc.metadata.custom.insert(
877        "profile_version".to_string(),
878        json!(CODEGRAPH_PROFILE_VERSION),
879    );
880    doc.metadata.custom.insert(
881        "profile_marker".to_string(),
882        json!(CODEGRAPH_PROFILE_MARKER),
883    );
884    doc.metadata.custom.insert(
885        "extractor_version".to_string(),
886        json!(CODEGRAPH_EXTRACTOR_VERSION),
887    );
888    doc.metadata
889        .custom
890        .insert("commit_hash".to_string(), json!(commit));
891    doc.metadata.custom.insert(
892        "repository_path".to_string(),
893        json!(normalize_path(repo_root)),
894    );
895}
896
897fn make_repository_block(repo_name: &str, commit_hash: &str) -> Block {
898    let mut block = Block::new(
899        Content::json(json!({
900            "name": repo_name,
901            "commit": commit_hash,
902        })),
903        Some("custom.repository"),
904    );
905    block.metadata.label = Some(repo_name.to_string());
906    block
907        .metadata
908        .custom
909        .insert(META_NODE_CLASS.to_string(), json!("repository"));
910    block.metadata.custom.insert(
911        META_LOGICAL_KEY.to_string(),
912        json!(format!("repository:{}", repo_name)),
913    );
914    block
915}
916
917fn make_directory_block(path: &str) -> Block {
918    let mut block = Block::new(
919        Content::json(json!({
920            "path": path,
921        })),
922        Some("custom.directory"),
923    );
924    block.metadata.label = Some(path.to_string());
925    block
926        .metadata
927        .custom
928        .insert(META_NODE_CLASS.to_string(), json!("directory"));
929    block
930        .metadata
931        .custom
932        .insert(META_PATH.to_string(), json!(path));
933    block.metadata.custom.insert(
934        META_LOGICAL_KEY.to_string(),
935        json!(format!("directory:{}", path)),
936    );
937    block
938}
939
940fn make_file_block(path: &str, language: &str) -> Block {
941    let mut block = Block::new(
942        Content::json(json!({
943            "path": path,
944            "language": language,
945        })),
946        Some("custom.file"),
947    );
948    block.metadata.label = Some(path.to_string());
949    block
950        .metadata
951        .custom
952        .insert(META_NODE_CLASS.to_string(), json!("file"));
953    block
954        .metadata
955        .custom
956        .insert(META_PATH.to_string(), json!(path));
957    block
958        .metadata
959        .custom
960        .insert(META_LANGUAGE.to_string(), json!(language));
961    block.metadata.custom.insert(
962        META_LOGICAL_KEY.to_string(),
963        json!(format!("file:{}", path)),
964    );
965    block
966}
967
968fn make_symbol_block(
969    logical_key: &str,
970    path: &str,
971    language: &str,
972    symbol: &ExtractedSymbol,
973) -> Block {
974    let span = json!({
975        "start_line": symbol.start_line,
976        "start_col": symbol.start_col,
977        "end_line": symbol.end_line,
978        "end_col": symbol.end_col,
979    });
980
981    let mut block = Block::new(
982        Content::json(json!({
983            "name": symbol.name,
984            "kind": symbol.kind,
985            "path": path,
986            "span": span,
987            "exported": symbol.exported,
988        })),
989        Some("custom.symbol"),
990    );
991
992    block.metadata.label = Some(symbol.name.clone());
993    block
994        .metadata
995        .custom
996        .insert(META_NODE_CLASS.to_string(), json!("symbol"));
997    block
998        .metadata
999        .custom
1000        .insert(META_LOGICAL_KEY.to_string(), json!(logical_key));
1001    block
1002        .metadata
1003        .custom
1004        .insert(META_PATH.to_string(), json!(path));
1005    block
1006        .metadata
1007        .custom
1008        .insert(META_LANGUAGE.to_string(), json!(language));
1009    block
1010        .metadata
1011        .custom
1012        .insert(META_SYMBOL_KIND.to_string(), json!(symbol.kind));
1013    block
1014        .metadata
1015        .custom
1016        .insert(META_SYMBOL_NAME.to_string(), json!(symbol.name));
1017    block.metadata.custom.insert(META_SPAN.to_string(), span);
1018    block
1019        .metadata
1020        .custom
1021        .insert(META_EXPORTED.to_string(), json!(symbol.exported));
1022    block
1023}
1024
/// Parse `source` with tree-sitter and extract top-level symbols and imports.
///
/// Never returns an error: parser setup/parse failures are reported as
/// diagnostics (CG2010/CG2011) on the returned `FileAnalysis`, and syntax
/// errors only produce a warning (CG2002) while extraction continues.
fn analyze_file(path: &str, source: &str, language: CodeLanguage) -> FileAnalysis {
    let mut analysis = FileAnalysis::default();
    let mut parser = Parser::new();
    // Grammar could not be loaded — nothing in this file can be analyzed.
    if parser.set_language(language_for(language)).is_err() {
        analysis.diagnostics.push(
            CodeGraphDiagnostic::error(
                "CG2010",
                format!(
                    "failed to initialize tree-sitter parser for {}",
                    language.as_str()
                ),
            )
            .with_path(path.to_string()),
        );
        return analysis;
    }

    // No parse tree at all is treated as a hard failure for this file.
    let Some(tree) = parser.parse(source, None) else {
        analysis.diagnostics.push(
            CodeGraphDiagnostic::error("CG2011", "tree-sitter returned no parse tree")
                .with_path(path.to_string()),
        );
        return analysis;
    };

    let root = tree.root_node();
    // Syntax errors are tolerated: tree-sitter still yields a partial tree.
    if root.has_error() {
        analysis.diagnostics.push(
            CodeGraphDiagnostic::warning(
                "CG2002",
                "tree-sitter parser reported syntax errors; extraction continues",
            )
            .with_path(path.to_string()),
        );
    }

    // Dispatch to the per-language walker; TS and JS share one walker.
    match language {
        CodeLanguage::Rust => analyze_rust_tree(source, root, &mut analysis),
        CodeLanguage::Python => analyze_python_tree(source, root, &mut analysis),
        CodeLanguage::TypeScript | CodeLanguage::JavaScript => {
            analyze_ts_tree(source, root, &mut analysis)
        }
    }

    // Informational only: empty file or no recognized top-level items.
    if analysis.symbols.is_empty() {
        analysis.diagnostics.push(
            CodeGraphDiagnostic::info(
                "CG2001",
                format!("no top-level symbols extracted for {}", path),
            )
            .with_path(path.to_string()),
        );
    }

    analysis
}
1081
/// Map our `CodeLanguage` enum to the corresponding tree-sitter grammar.
fn language_for(language: CodeLanguage) -> Language {
    match language {
        CodeLanguage::Rust => tree_sitter_rust::language(),
        CodeLanguage::Python => tree_sitter_python::language(),
        // NOTE(review): this is the plain-TypeScript grammar; `.tsx` files
        // would need the separate TSX grammar — confirm that is intended.
        CodeLanguage::TypeScript => tree_sitter_typescript::language_typescript(),
        CodeLanguage::JavaScript => tree_sitter_javascript::language(),
    }
}
1090
/// Walk the top-level items of a Rust file, collecting `use` imports,
/// external `mod foo;` declarations, and item symbols into `analysis`.
fn analyze_rust_tree(source: &str, root: Node<'_>, analysis: &mut FileAnalysis) {
    let mut cursor = root.walk();
    for node in root.named_children(&mut cursor) {
        match node.kind() {
            "use_declaration" => {
                // Record the path with `pub `/`use ` and the trailing `;`
                // stripped, e.g. `pub use foo::Bar;` -> `foo::Bar`.
                // NOTE(review): `pub(crate) use ...` is not stripped and is
                // recorded with its visibility prefix — confirm intended.
                let import_text = node_text(source, node)
                    .trim()
                    .trim_start_matches("pub ")
                    .trim_start_matches("use ")
                    .trim_end_matches(';')
                    .trim()
                    .to_string();
                if !import_text.is_empty() {
                    analysis.imports.push(ExtractedImport {
                        module: import_text,
                    });
                }
            }
            "mod_item" => {
                let text = node_text(source, node);
                // `mod foo;` (no body) references a sibling file; record a
                // `mod:<name>` marker so resolve_rust_import can map it.
                if text.trim().ends_with(';') {
                    if let Some(name) = rust_symbol_name(node, source) {
                        analysis.imports.push(ExtractedImport {
                            module: format!("mod:{}", name),
                        });
                    }
                }
                // Inline or external, the module is also reported as a symbol.
                if let Some(symbol) = rust_symbol_from_node(node, source) {
                    analysis.symbols.push(symbol);
                }
            }
            "function_item" | "struct_item" | "enum_item" | "trait_item" | "impl_item"
            | "type_item" | "const_item" => {
                if let Some(symbol) = rust_symbol_from_node(node, source) {
                    analysis.symbols.push(symbol);
                }
            }
            _ => {}
        }
    }
}
1132
1133fn rust_symbol_from_node(node: Node<'_>, source: &str) -> Option<ExtractedSymbol> {
1134    let kind = match node.kind() {
1135        "function_item" => "function",
1136        "struct_item" => "struct",
1137        "enum_item" => "enum",
1138        "trait_item" => "trait",
1139        "impl_item" => "impl",
1140        "type_item" => "type",
1141        "const_item" => "const",
1142        "mod_item" => "module",
1143        _ => return None,
1144    };
1145
1146    let name = rust_symbol_name(node, source)?;
1147    let exported = node_text(source, node).trim_start().starts_with("pub");
1148    let (start_line, start_col, end_line, end_col) = node_span(node);
1149
1150    Some(ExtractedSymbol {
1151        name,
1152        kind: kind.to_string(),
1153        exported,
1154        start_line,
1155        start_col,
1156        end_line,
1157        end_col,
1158    })
1159}
1160
1161fn rust_symbol_name(node: Node<'_>, source: &str) -> Option<String> {
1162    if let Some(name_node) = node.child_by_field_name("name") {
1163        let name = node_text(source, name_node).trim().to_string();
1164        if !name.is_empty() {
1165            return Some(name);
1166        }
1167    }
1168
1169    if node.kind() == "impl_item" {
1170        if let Some(type_node) = node.child_by_field_name("type") {
1171            let name = node_text(source, type_node).trim().to_string();
1172            if !name.is_empty() {
1173                return Some(name);
1174            }
1175        }
1176    }
1177
1178    first_named_identifier(node, source)
1179}
1180
/// Walk the top-level statements of a Python file, collecting `import` /
/// `from ... import` module names and top-level function/class symbols.
fn analyze_python_tree(source: &str, root: Node<'_>, analysis: &mut FileAnalysis) {
    let mut cursor = root.walk();
    for node in root.named_children(&mut cursor) {
        match node.kind() {
            "import_statement" => {
                let text = node_text(source, node).trim().to_string();
                // `import a, b as c` -> modules `a` and `b` (aliases dropped).
                if let Some(list) = text.strip_prefix("import ") {
                    for item in list.split(',') {
                        let name = item.split_whitespace().next().unwrap_or("").trim();
                        if !name.is_empty() {
                            analysis.imports.push(ExtractedImport {
                                module: name.to_string(),
                            });
                        }
                    }
                }
            }
            "import_from_statement" => {
                let text = node_text(source, node).trim().to_string();
                // `from pkg.mod import x` -> module `pkg.mod`; the imported
                // names are ignored. Relative forms keep their leading dots.
                if let Some(rest) = text.strip_prefix("from ") {
                    if let Some((module, _)) = rest.split_once(" import ") {
                        let module_name = module.trim();
                        if !module_name.is_empty() {
                            analysis.imports.push(ExtractedImport {
                                module: module_name.to_string(),
                            });
                        }
                    }
                }
            }
            "function_definition" | "class_definition" => {
                let Some(name_node) = node.child_by_field_name("name") else {
                    continue;
                };
                let name = node_text(source, name_node).trim().to_string();
                if name.is_empty() {
                    continue;
                }
                let (start_line, start_col, end_line, end_col) = node_span(node);
                analysis.symbols.push(ExtractedSymbol {
                    name: name.clone(),
                    kind: if node.kind() == "class_definition" {
                        "class".to_string()
                    } else {
                        "function".to_string()
                    },
                    // Python convention: leading underscore means private.
                    exported: !name.starts_with('_'),
                    start_line,
                    start_col,
                    end_line,
                    end_col,
                });
            }
            _ => {}
        }
    }
}
1238
1239fn analyze_ts_tree(source: &str, root: Node<'_>, analysis: &mut FileAnalysis) {
1240    let mut cursor = root.walk();
1241    for node in root.named_children(&mut cursor) {
1242        match node.kind() {
1243            "import_statement" => {
1244                if let Some(module) = extract_ts_module_from_text(node_text(source, node)) {
1245                    analysis.imports.push(ExtractedImport { module });
1246                }
1247            }
1248            "export_statement" => {
1249                if let Some(module) = extract_ts_module_from_text(node_text(source, node)) {
1250                    analysis.imports.push(ExtractedImport { module });
1251                }
1252                analysis
1253                    .symbols
1254                    .extend(ts_symbols_from_export_statement(node, source));
1255            }
1256            "function_declaration"
1257            | "class_declaration"
1258            | "interface_declaration"
1259            | "type_alias_declaration"
1260            | "enum_declaration"
1261            | "module" => {
1262                if let Some(symbol) = ts_symbol_from_declaration(node, source, false) {
1263                    analysis.symbols.push(symbol);
1264                }
1265            }
1266            "lexical_declaration" | "variable_statement" => {
1267                analysis
1268                    .symbols
1269                    .extend(ts_variable_symbols(node, source, false));
1270            }
1271            _ => {}
1272        }
1273    }
1274}
1275
1276fn ts_symbols_from_export_statement(node: Node<'_>, source: &str) -> Vec<ExtractedSymbol> {
1277    let mut out = Vec::new();
1278    let mut cursor = node.walk();
1279    for child in node.named_children(&mut cursor) {
1280        match child.kind() {
1281            "function_declaration"
1282            | "class_declaration"
1283            | "interface_declaration"
1284            | "type_alias_declaration"
1285            | "enum_declaration"
1286            | "module" => {
1287                if let Some(symbol) = ts_symbol_from_declaration(child, source, true) {
1288                    out.push(symbol);
1289                }
1290            }
1291            "lexical_declaration" | "variable_statement" => {
1292                out.extend(ts_variable_symbols(child, source, true));
1293            }
1294            _ => {}
1295        }
1296    }
1297    out
1298}
1299
1300fn ts_symbol_from_declaration(
1301    node: Node<'_>,
1302    source: &str,
1303    exported_hint: bool,
1304) -> Option<ExtractedSymbol> {
1305    let kind = match node.kind() {
1306        "function_declaration" => "function",
1307        "class_declaration" => "class",
1308        "interface_declaration" => "interface",
1309        "type_alias_declaration" => "type",
1310        "enum_declaration" => "enum",
1311        "module" => "namespace",
1312        _ => return None,
1313    };
1314
1315    let name = node
1316        .child_by_field_name("name")
1317        .map(|n| node_text(source, n).trim().to_string())
1318        .or_else(|| first_named_identifier(node, source))?;
1319    if name.is_empty() {
1320        return None;
1321    }
1322    let exported = exported_hint || node_text(source, node).trim_start().starts_with("export ");
1323    let (start_line, start_col, end_line, end_col) = node_span(node);
1324
1325    Some(ExtractedSymbol {
1326        name,
1327        kind: kind.to_string(),
1328        exported,
1329        start_line,
1330        start_col,
1331        end_line,
1332        end_col,
1333    })
1334}
1335
/// Collect every `variable_declarator` name under a `lexical_declaration` /
/// `variable_statement` node as a `variable` symbol.
///
/// Uses an explicit LIFO stack (children pushed in document order, popped in
/// reverse), so declarators within one statement are emitted in reverse
/// document order — preserve this traversal if editing.
fn ts_variable_symbols(node: Node<'_>, source: &str, exported_hint: bool) -> Vec<ExtractedSymbol> {
    let mut out = Vec::new();
    // Exported when inside an export statement or prefixed with `export `.
    let exported = exported_hint || node_text(source, node).trim_start().starts_with("export ");

    let mut stack = vec![node];
    while let Some(current) = stack.pop() {
        if current.kind() == "variable_declarator" {
            if let Some(name_node) = current.child_by_field_name("name") {
                let name = node_text(source, name_node).trim().to_string();
                if !name.is_empty() {
                    let (start_line, start_col, end_line, end_col) = node_span(current);
                    out.push(ExtractedSymbol {
                        name,
                        kind: "variable".to_string(),
                        exported,
                        start_line,
                        start_col,
                        end_line,
                        end_col,
                    });
                }
            }
            // Do not descend into the declarator's initializer.
            continue;
        }

        let mut cursor = current.walk();
        for child in current.named_children(&mut cursor) {
            stack.push(child);
        }
    }

    out
}
1369
1370fn extract_ts_module_from_text(text: &str) -> Option<String> {
1371    let patterns = [
1372        Regex::new(r#"(?i)\bfrom\s+['"]([^'"]+)['"]"#).ok()?,
1373        Regex::new(r#"(?i)\bimport\s+['"]([^'"]+)['"]"#).ok()?,
1374        Regex::new(r#"(?i)require\(\s*['"]([^'"]+)['"]\s*\)"#).ok()?,
1375    ];
1376    for pattern in patterns {
1377        if let Some(caps) = pattern.captures(text) {
1378            if let Some(module) = caps.get(1).map(|m| m.as_str().trim()) {
1379                if !module.is_empty() {
1380                    return Some(module.to_string());
1381                }
1382            }
1383        }
1384    }
1385    None
1386}
1387
1388fn node_text<'a>(source: &'a str, node: Node<'_>) -> &'a str {
1389    let start = node.start_byte().min(source.len());
1390    let end = node.end_byte().min(source.len());
1391    &source[start..end]
1392}
1393
/// Convert a node's zero-based tree-sitter position into a 1-based
/// `(start_line, start_col, end_line, end_col)` tuple.
fn node_span(node: Node<'_>) -> (usize, usize, usize, usize) {
    let start = node.start_position();
    let end = node.end_position();
    (start.row + 1, start.column + 1, end.row + 1, end.column + 1)
}
1399
/// Depth-first search for the first `identifier`/`type_identifier` node under
/// (or at) `node`, returning its trimmed text; used as a last-resort name.
///
/// Children are pushed in document order onto a LIFO stack, so later children
/// are visited first — acceptable for a best-effort guess, but preserve the
/// traversal order if editing.
fn first_named_identifier(node: Node<'_>, source: &str) -> Option<String> {
    let mut stack = vec![node];
    while let Some(current) = stack.pop() {
        if matches!(current.kind(), "identifier" | "type_identifier") {
            let text = node_text(source, current).trim().to_string();
            if !text.is_empty() {
                return Some(text);
            }
        }

        let mut cursor = current.walk();
        for child in current.named_children(&mut cursor) {
            stack.push(child);
        }
    }
    None
}
1417
1418fn resolve_import(
1419    source_file: &str,
1420    language: &CodeLanguage,
1421    module: &str,
1422    known_files: &BTreeSet<String>,
1423) -> Option<String> {
1424    match language {
1425        CodeLanguage::Rust => resolve_rust_import(source_file, module, known_files),
1426        CodeLanguage::Python => resolve_python_import(source_file, module, known_files),
1427        CodeLanguage::TypeScript | CodeLanguage::JavaScript => {
1428            resolve_ts_import(source_file, module, known_files)
1429        }
1430    }
1431}
1432
1433fn resolve_ts_import(
1434    source_file: &str,
1435    module: &str,
1436    known_files: &BTreeSet<String>,
1437) -> Option<String> {
1438    if !module.starts_with('.') {
1439        return None;
1440    }
1441
1442    let source_dir = parent_directory(source_file);
1443    let joined = normalize_relative_join(&source_dir, module);
1444
1445    ts_candidates(&joined)
1446        .into_iter()
1447        .find(|candidate| known_files.contains(candidate))
1448}
1449
1450fn ts_candidates(base: &str) -> Vec<String> {
1451    let exts = ["ts", "tsx", "js", "jsx"];
1452    let mut out = Vec::new();
1453
1454    if has_known_extension(base, &exts) {
1455        out.push(base.to_string());
1456    } else {
1457        for ext in exts {
1458            out.push(format!("{}.{}", base, ext));
1459        }
1460        for ext in exts {
1461            out.push(format!("{}/index.{}", base, ext));
1462        }
1463    }
1464
1465    out
1466}
1467
1468fn resolve_python_import(
1469    source_file: &str,
1470    module: &str,
1471    known_files: &BTreeSet<String>,
1472) -> Option<String> {
1473    let source_dir = parent_directory(source_file);
1474    let mut dots = 0usize;
1475    for ch in module.chars() {
1476        if ch == '.' {
1477            dots += 1;
1478        } else {
1479            break;
1480        }
1481    }
1482
1483    let module_tail = module.trim_start_matches('.');
1484
1485    let base_dir = if dots > 0 {
1486        ascend_directory(&source_dir, dots.saturating_sub(1))
1487    } else {
1488        String::new()
1489    };
1490
1491    let module_path = module_tail.replace('.', "/");
1492
1493    let joined = if base_dir.is_empty() {
1494        module_path
1495    } else if module_path.is_empty() {
1496        base_dir
1497    } else {
1498        format!("{}/{}", base_dir, module_path)
1499    };
1500
1501    py_candidates(&joined)
1502        .into_iter()
1503        .find(|candidate| known_files.contains(candidate))
1504}
1505
/// Candidate file paths for a Python module base: nothing for an empty base,
/// the path itself when it already ends in `.py`, otherwise a module file
/// (`<base>.py`) and a package marker (`<base>/__init__.py`).
fn py_candidates(base: &str) -> Vec<String> {
    match base {
        "" => Vec::new(),
        b if b.ends_with(".py") => vec![b.to_string()],
        b => vec![format!("{}.py", b), format!("{}/__init__.py", b)],
    }
}
1517
/// Best-effort mapping of a Rust import path to a repository file.
///
/// Handles `mod:<name>` markers (from `mod foo;` declarations) relative to
/// the declaring file, plus `crate::`, `self::` and `super::` prefixes; bare
/// paths are assumed to live under `src/`. Standard-library paths are
/// skipped. Returns the first candidate (`<path>.rs` or `<path>/mod.rs`)
/// present in `known_files`.
fn resolve_rust_import(
    source_file: &str,
    module: &str,
    known_files: &BTreeSet<String>,
) -> Option<String> {
    // Standard-library imports can never resolve to a repository file.
    if module.starts_with("std::") || module.starts_with("core::") || module.starts_with("alloc::")
    {
        return None;
    }

    // `mod foo;` declaration: look for the module file next to the source.
    if let Some(name) = module.strip_prefix("mod:") {
        let source_dir = parent_directory(source_file);
        let local = normalize_relative_join(&source_dir, name);
        for candidate in [format!("{}.rs", local), format!("{}/mod.rs", local)] {
            if known_files.contains(&candidate) {
                return Some(candidate);
            }
        }
        return None;
    }

    let source_dir = parent_directory(source_file);

    // Anchor directory plus the remaining `::`-separated path segments.
    let (base_dir, path_segments) = if let Some(rest) = module.strip_prefix("crate::") {
        (
            // NOTE(review): assumes the conventional `src/` crate root.
            "src".to_string(),
            rest.split("::").map(|s| s.to_string()).collect::<Vec<_>>(),
        )
    } else if let Some(rest) = module.strip_prefix("self::") {
        (
            source_dir.clone(),
            rest.split("::").map(|s| s.to_string()).collect::<Vec<_>>(),
        )
    } else if module.starts_with("super::") {
        // Each `super::` climbs one directory from the source file's dir.
        let mut rest = module;
        let mut super_count = 0usize;
        while let Some(next) = rest.strip_prefix("super::") {
            super_count += 1;
            rest = next;
        }
        (
            ascend_directory(&source_dir, super_count),
            rest.split("::").map(|s| s.to_string()).collect::<Vec<_>>(),
        )
    } else {
        // Bare path (e.g. an in-repo crate/module): try it under `src/`.
        (
            "src".to_string(),
            module
                .split("::")
                .map(|s| s.to_string())
                .collect::<Vec<_>>(),
        )
    };

    // Try the longest segment prefix first (`a::b::c` before `a::b`), since
    // trailing segments may name items rather than modules.
    for trimmed in (1..=path_segments.len()).rev() {
        let joined = path_segments[..trimmed].join("/");
        if joined.is_empty() {
            continue;
        }
        let candidate_base = if base_dir.is_empty() {
            joined
        } else {
            format!("{}/{}", base_dir, joined)
        };

        for candidate in [
            format!("{}.rs", candidate_base),
            format!("{}/mod.rs", candidate_base),
        ] {
            if known_files.contains(&candidate) {
                return Some(candidate);
            }
        }
    }

    None
}
1595
/// True when `path` ends with `.<ext>` for one of the given extensions
/// (given without their leading dot).
///
/// Equivalent to `path.ends_with(&format!(".{}", ext))` for each candidate,
/// but without allocating a fresh `String` per extension per call: it checks
/// the suffix and that the byte immediately before it is `.`.
fn has_known_extension(path: &str, exts: &[&str]) -> bool {
    exts.iter().any(|ext| {
        path.len() > ext.len()
            && path.as_bytes()[path.len() - ext.len() - 1] == b'.'
            && path.ends_with(ext)
    })
}
1599
1600fn normalize_temporal_fields(doc: &mut Document) {
1601    let ts = deterministic_timestamp();
1602    doc.metadata.created_at = ts;
1603    doc.metadata.modified_at = ts;
1604    doc.version.timestamp = ts;
1605
1606    for block in doc.blocks.values_mut() {
1607        block.metadata.created_at = ts;
1608        block.metadata.modified_at = ts;
1609        block.version.timestamp = ts;
1610
1611        for edge in &mut block.edges {
1612            edge.created_at = ts;
1613        }
1614    }
1615}
1616
1617fn deterministic_timestamp() -> chrono::DateTime<chrono::Utc> {
1618    chrono::DateTime::parse_from_rfc3339("1970-01-01T00:00:00Z")
1619        .unwrap()
1620        .with_timezone(&chrono::Utc)
1621}
1622
1623fn sort_structure_children_by_logical_key(doc: &mut Document) {
1624    let key_index = logical_key_index(doc);
1625
1626    for children in doc.structure.values_mut() {
1627        children.sort_by(|a, b| {
1628            let ka = key_index.get(a).cloned().unwrap_or_else(|| a.to_string());
1629            let kb = key_index.get(b).cloned().unwrap_or_else(|| b.to_string());
1630            ka.cmp(&kb)
1631        });
1632    }
1633}
1634
1635fn sort_edges(doc: &mut Document) {
1636    let key_index = logical_key_index(doc);
1637
1638    for block in doc.blocks.values_mut() {
1639        block.edges.sort_by(|a, b| {
1640            let at = key_index
1641                .get(&a.target)
1642                .cloned()
1643                .unwrap_or_else(|| a.target.to_string());
1644            let bt = key_index
1645                .get(&b.target)
1646                .cloned()
1647                .unwrap_or_else(|| b.target.to_string());
1648
1649            a.edge_type
1650                .as_str()
1651                .cmp(&b.edge_type.as_str())
1652                .then_with(|| at.cmp(&bt))
1653        });
1654    }
1655}
1656
/// Tally node and edge counts per node class (plus per-language file counts)
/// for the whole document, skipping the synthetic root block.
fn compute_stats(doc: &Document) -> CodeGraphStats {
    let mut stats = CodeGraphStats::default();

    for (id, block) in &doc.blocks {
        // The root container is structural, not part of the graph.
        if *id == doc.root {
            continue;
        }

        stats.total_nodes += 1;

        match node_class(block).as_deref() {
            Some("repository") => stats.repository_nodes += 1,
            Some("directory") => stats.directory_nodes += 1,
            Some("file") => {
                stats.file_nodes += 1;
                // Per-language file counts keyed by the `language` metadata.
                if let Some(lang) = block
                    .metadata
                    .custom
                    .get(META_LANGUAGE)
                    .and_then(|v| v.as_str())
                {
                    *stats.languages.entry(lang.to_string()).or_default() += 1;
                }
            }
            Some("symbol") => stats.symbol_nodes += 1,
            _ => {}
        }

        // Edges live on their source block; count them all, with dedicated
        // counters for `references` and custom `exports` edges.
        for edge in &block.edges {
            stats.total_edges += 1;
            match &edge.edge_type {
                EdgeType::References => stats.reference_edges += 1,
                EdgeType::Custom(name) if name == "exports" => stats.export_edges += 1,
                _ => {}
            }
        }
    }

    stats
}
1697
1698fn block_logical_key(block: &Block) -> Option<String> {
1699    block
1700        .metadata
1701        .custom
1702        .get(META_LOGICAL_KEY)
1703        .and_then(|v| v.as_str())
1704        .map(|s| s.to_string())
1705}
1706
1707fn block_path(block: &Block) -> Option<String> {
1708    block
1709        .metadata
1710        .custom
1711        .get(META_PATH)
1712        .and_then(|v| v.as_str())
1713        .map(|s| s.to_string())
1714}
1715
1716fn node_class(block: &Block) -> Option<String> {
1717    if let Some(class) = block
1718        .metadata
1719        .custom
1720        .get(META_NODE_CLASS)
1721        .and_then(|v| v.as_str())
1722    {
1723        return Some(class.to_string());
1724    }
1725
1726    if let Some(role) = &block.metadata.semantic_role {
1727        if role.category == ucm_core::RoleCategory::Custom {
1728            if let Some(sub) = &role.subcategory {
1729                return Some(sub.to_string());
1730            }
1731        }
1732    }
1733
1734    None
1735}
1736
/// Check that a classified block carries every metadata key its node class
/// requires, and that its `logical_key` starts with the class's prefix.
///
/// Emits CG1017 (unknown class), CG1018 (missing required key) and CG1019
/// (wrong logical_key prefix) diagnostics; never mutates the block.
fn validate_required_metadata(
    class_name: &str,
    block: &Block,
    diagnostics: &mut Vec<CodeGraphDiagnostic>,
) {
    // Per-class set of required custom-metadata keys.
    let required = match class_name {
        "repository" => vec![META_LOGICAL_KEY],
        "directory" => vec![META_LOGICAL_KEY, META_PATH],
        "file" => vec![META_LOGICAL_KEY, META_PATH, META_LANGUAGE],
        "symbol" => vec![
            META_LOGICAL_KEY,
            META_PATH,
            META_LANGUAGE,
            META_SYMBOL_KIND,
            META_SYMBOL_NAME,
            META_SPAN,
            META_EXPORTED,
        ],
        _ => {
            diagnostics.push(CodeGraphDiagnostic::error(
                "CG1017",
                format!("invalid node_class '{}'", class_name),
            ));
            return;
        }
    };

    // One CG1018 per missing key; diagnostics carry the logical key (or the
    // block id when no logical key is set) for traceability.
    for key in required {
        if !block.metadata.custom.contains_key(key) {
            diagnostics.push(
                CodeGraphDiagnostic::error(
                    "CG1018",
                    format!(
                        "node class '{}' missing required metadata key '{}'",
                        class_name, key
                    ),
                )
                .with_logical_key(block_logical_key(block).unwrap_or_else(|| block.id.to_string())),
            );
        }
    }

    // The logical key, when present, must use the class-specific prefix.
    if let Some(logical_key) = block_logical_key(block) {
        let expected_prefix = match class_name {
            "repository" => "repository:",
            "directory" => "directory:",
            "file" => "file:",
            "symbol" => "symbol:",
            _ => "",
        };

        if !expected_prefix.is_empty() && !logical_key.starts_with(expected_prefix) {
            diagnostics.push(
                CodeGraphDiagnostic::error(
                    "CG1019",
                    format!(
                        "logical_key '{}' must start with '{}'",
                        logical_key, expected_prefix
                    ),
                )
                .with_logical_key(logical_key),
            );
        }
    }
}
1802
1803fn logical_key_index(doc: &Document) -> HashMap<BlockId, String> {
1804    doc.blocks
1805        .iter()
1806        .map(|(id, block)| {
1807            (
1808                *id,
1809                block_logical_key(block).unwrap_or_else(|| id.to_string()),
1810            )
1811        })
1812        .collect()
1813}
1814
1815fn normalized_document_metadata(doc: &Document) -> serde_json::Value {
1816    let mut custom = serde_json::Map::new();
1817    let mut custom_entries: Vec<_> = doc.metadata.custom.iter().collect();
1818    custom_entries.sort_by(|a, b| a.0.cmp(b.0));
1819    for (k, v) in custom_entries {
1820        if is_volatile_metadata_key(k) {
1821            continue;
1822        }
1823        custom.insert(k.clone(), v.clone());
1824    }
1825
1826    json!({
1827        "title": doc.metadata.title,
1828        "description": doc.metadata.description,
1829        "authors": doc.metadata.authors,
1830        "language": doc.metadata.language,
1831        "custom": custom,
1832    })
1833}
1834
1835fn normalized_block_metadata(block: &Block) -> serde_json::Value {
1836    let mut custom = serde_json::Map::new();
1837    let mut entries: Vec<_> = block.metadata.custom.iter().collect();
1838    entries.sort_by(|a, b| a.0.cmp(b.0));
1839    for (k, v) in entries {
1840        if is_volatile_metadata_key(k) {
1841            continue;
1842        }
1843        custom.insert(k.clone(), v.clone());
1844    }
1845
1846    json!({
1847        "label": block.metadata.label,
1848        "semantic_role": block.metadata.semantic_role.as_ref().map(|r| r.to_string()),
1849        "tags": block.metadata.tags,
1850        "summary": block.metadata.summary,
1851        "custom": custom,
1852    })
1853}
1854
1855fn normalized_edge_metadata(edge: &Edge) -> serde_json::Value {
1856    let mut custom = serde_json::Map::new();
1857    let mut entries: Vec<_> = edge.metadata.custom.iter().collect();
1858    entries.sort_by(|a, b| a.0.cmp(b.0));
1859    for (k, v) in entries {
1860        if is_volatile_metadata_key(k) {
1861            continue;
1862        }
1863        custom.insert(k.clone(), v.clone());
1864    }
1865
1866    json!({
1867        "confidence": edge.metadata.confidence,
1868        "description": edge.metadata.description,
1869        "custom": custom,
1870    })
1871}
1872
/// Metadata keys excluded from canonical serialization because their values
/// change from run to run.
fn is_volatile_metadata_key(key: &str) -> bool {
    const VOLATILE_KEYS: [&str; 4] = ["generated_at", "runtime", "session", "timestamp"];
    VOLATILE_KEYS.contains(&key)
}
1876
1877fn collect_repository_files(
1878    root: &Path,
1879    config: &CodeGraphExtractorConfig,
1880    matcher: &GitignoreMatcher,
1881    diagnostics: &mut Vec<CodeGraphDiagnostic>,
1882) -> Result<Vec<RepoFile>> {
1883    let include_exts: HashSet<String> = config
1884        .include_extensions
1885        .iter()
1886        .map(|ext| ext.trim_start_matches('.').to_ascii_lowercase())
1887        .collect();
1888
1889    let exclude_dirs: HashSet<String> = config.exclude_dirs.iter().cloned().collect();
1890
1891    let mut out = Vec::new();
1892    collect_repository_files_recursive(
1893        root,
1894        root,
1895        &include_exts,
1896        &exclude_dirs,
1897        config,
1898        matcher,
1899        diagnostics,
1900        &mut out,
1901    )?;
1902
1903    out.sort_by(|a, b| a.relative_path.cmp(&b.relative_path));
1904    Ok(out)
1905}
1906
/// Depth-first directory walk below `current` that appends matching source
/// files to `out`.
///
/// Per entry, in order: compute the root-relative path, read the file type,
/// skip hidden paths (unless `config.include_hidden`), recurse into
/// directories not excluded by name or gitignore, then keep regular files
/// whose extension is in `include_exts` and maps to a known language.
/// I/O failures are downgraded to warnings (CG2004 for unreadable
/// directories, CG2005 for entry/file-type errors) so one bad entry does not
/// abort the walk; only a failed `strip_prefix` is a hard error.
#[allow(clippy::too_many_arguments)]
fn collect_repository_files_recursive(
    root: &Path,
    current: &Path,
    include_exts: &HashSet<String>,
    exclude_dirs: &HashSet<String>,
    config: &CodeGraphExtractorConfig,
    matcher: &GitignoreMatcher,
    diagnostics: &mut Vec<CodeGraphDiagnostic>,
    out: &mut Vec<RepoFile>,
) -> Result<()> {
    // An unreadable directory is reported and skipped, not propagated.
    let read_dir = match fs::read_dir(current) {
        Ok(rd) => rd,
        Err(err) => {
            diagnostics.push(CodeGraphDiagnostic::warning(
                "CG2004",
                format!("failed to read directory {}: {}", current.display(), err),
            ));
            return Ok(());
        }
    };

    let mut entries = Vec::new();
    for entry in read_dir {
        match entry {
            Ok(e) => entries.push(e),
            Err(err) => diagnostics.push(CodeGraphDiagnostic::warning(
                "CG2005",
                format!("failed to access directory entry: {}", err),
            )),
        }
    }

    // Deterministic traversal order regardless of filesystem enumeration.
    entries.sort_by_key(|entry| entry.file_name());

    for entry in entries {
        let path = entry.path();
        let rel = normalize_path(
            path.strip_prefix(root)
                .with_context(|| format!("failed to strip prefix {}", root.display()))?,
        );

        if rel.is_empty() {
            continue;
        }

        let file_type = match entry.file_type() {
            Ok(ft) => ft,
            Err(err) => {
                diagnostics.push(CodeGraphDiagnostic::warning(
                    "CG2005",
                    format!("failed to read file type for {}: {}", rel, err),
                ));
                continue;
            }
        };

        // Hidden-path filtering applies to files and directories alike.
        if !config.include_hidden && is_hidden_path(&rel) {
            continue;
        }

        if file_type.is_dir() {
            let dir_name = path
                .file_name()
                .map(|n| n.to_string_lossy().to_string())
                .unwrap_or_default();

            // Directory pruning: excluded names and gitignore directory rules.
            if exclude_dirs.contains(&dir_name) || matcher.is_ignored(&rel, true) {
                continue;
            }

            collect_repository_files_recursive(
                root,
                &path,
                include_exts,
                exclude_dirs,
                config,
                matcher,
                diagnostics,
                out,
            )?;
            continue;
        }

        // Entries that are neither directories nor regular files are skipped.
        if !file_type.is_file() {
            continue;
        }

        if matcher.is_ignored(&rel, false) {
            continue;
        }

        let ext = path
            .extension()
            .and_then(|e| e.to_str())
            .map(|e| e.to_ascii_lowercase())
            .unwrap_or_default();

        if !include_exts.contains(&ext) {
            continue;
        }

        // A configured extension with no language mapping is reported as
        // info (CG2007) rather than silently dropped.
        if let Some(language) = extension_language(&ext) {
            out.push(RepoFile {
                absolute_path: path,
                relative_path: rel,
                language,
            });
        } else {
            diagnostics.push(
                CodeGraphDiagnostic::info("CG2007", format!("unsupported extension '.{}'", ext))
                    .with_path(rel),
            );
        }
    }

    Ok(())
}
2025
2026fn extension_language(ext: &str) -> Option<CodeLanguage> {
2027    match ext {
2028        "rs" => Some(CodeLanguage::Rust),
2029        "py" => Some(CodeLanguage::Python),
2030        "ts" | "tsx" => Some(CodeLanguage::TypeScript),
2031        "js" | "jsx" => Some(CodeLanguage::JavaScript),
2032        _ => None,
2033    }
2034}
2035
/// Produces a logical key of the form `symbol:<path>::<name>` that is unique
/// within `used`. Collisions are disambiguated first with `#<line>`, then with
/// an increasing `#<n>` counter starting at 2. The chosen key is recorded in
/// `used` before being returned.
fn unique_symbol_logical_key(
    file_path: &str,
    symbol_name: &str,
    line: usize,
    used: &mut HashSet<String>,
) -> String {
    let base = format!("symbol:{}::{}", file_path, symbol_name);
    let with_line = format!("{}#{}", base, line);

    // Candidate sequence: base, base#line, base#line#2, base#line#3, ...
    let candidates = std::iter::once(base)
        .chain(std::iter::once(with_line.clone()))
        .chain((2usize..).map(move |n| format!("{}#{}", with_line, n)));

    for candidate in candidates {
        if used.insert(candidate.clone()) {
            return candidate;
        }
    }
    unreachable!("candidate iterator is unbounded")
}
2061
/// Returns every proper ancestor directory of `path`, shallowest first
/// (e.g. `a/b/c.rs` -> `["a", "a/b"]`). Empty prefixes (from a leading
/// separator) are omitted.
fn ancestor_directories(path: &str) -> Vec<String> {
    let parts: Vec<&str> = path.split('/').collect();
    (1..parts.len())
        .map(|i| parts[..i].join("/"))
        .filter(|dir| !dir.is_empty())
        .collect()
}
2077
2078fn parent_directory_id(dir: &str, directory_ids: &BTreeMap<String, BlockId>) -> Option<BlockId> {
2079    let parent = parent_directory(dir);
2080    if parent.is_empty() {
2081        None
2082    } else {
2083        directory_ids.get(&parent).copied()
2084    }
2085}
2086
2087fn parent_id_for_file(
2088    path: &str,
2089    repo_id: BlockId,
2090    directory_ids: &BTreeMap<String, BlockId>,
2091) -> BlockId {
2092    let parent_dir = parent_directory(path);
2093    if parent_dir.is_empty() {
2094        repo_id
2095    } else {
2096        directory_ids.get(&parent_dir).copied().unwrap_or(repo_id)
2097    }
2098}
2099
/// Everything before the last `/` in `path`, or the empty string when `path`
/// has no separator.
fn parent_directory(path: &str) -> String {
    path.rsplit_once('/')
        .map(|(parent, _)| parent.to_string())
        .unwrap_or_default()
}
2106
/// Joins `relative` onto `base`, resolving `.` and `..` segments and
/// collapsing empty ones. `..` at the top of the stack is dropped silently.
fn normalize_relative_join(base: &str, relative: &str) -> String {
    // Splitting an empty base yields one empty segment, which the filter
    // removes, so no special-casing is needed.
    let mut stack: Vec<String> = base
        .split('/')
        .filter(|s| !s.is_empty())
        .map(str::to_string)
        .collect();

    for segment in relative.split('/') {
        match segment {
            "" | "." => continue,
            ".." => {
                stack.pop();
            }
            name => stack.push(name.to_string()),
        }
    }

    stack.join("/")
}
2130
/// Drops the last `levels` segments of `path`, saturating at the empty
/// string; empty segments are discarded first.
fn ascend_directory(path: &str, levels: usize) -> String {
    let parts: Vec<&str> = path.split('/').filter(|s| !s.is_empty()).collect();
    let keep = parts.len().saturating_sub(levels);
    parts[..keep].join("/")
}
2141
/// Replaces every character outside `[A-Za-z0-9_-]` with `_`, preserving
/// the input's length.
fn sanitize_identifier(raw: &str) -> String {
    raw.chars()
        .map(|c| {
            if c.is_ascii_alphanumeric() || matches!(c, '-' | '_') {
                c
            } else {
                '_'
            }
        })
        .collect()
}
2153
/// Renders `path` as a `/`-joined string of its components, dropping `.`
/// and empty segments; non-UTF-8 components are converted lossily.
fn normalize_path(path: &Path) -> String {
    let mut parts: Vec<String> = Vec::new();
    for component in path.components() {
        let text = component.as_os_str().to_string_lossy();
        if text.is_empty() || text == "." {
            continue;
        }
        parts.push(text.into_owned());
    }
    parts.join("/")
}
2167
/// True when any `/`-separated segment of `path` starts with a dot
/// (e.g. `.git/config`, `src/.cache/x`).
fn is_hidden_path(path: &str) -> bool {
    for segment in path.split('/') {
        if segment.starts_with('.') {
            return true;
        }
    }
    false
}
2171
/// Minimal `.gitignore`-style matcher: an ordered list of compiled rules.
/// Built by [`GitignoreMatcher::from_repository`]; negation (`!`) patterns
/// are not represented — they are skipped at load time.
#[derive(Debug, Clone)]
struct GitignoreMatcher {
    rules: Vec<GitignoreRule>,
}
2176
/// One compiled ignore pattern: a regex applied to normalized relative
/// paths, plus a flag for patterns that ended with `/` (directory-only).
#[derive(Debug, Clone)]
struct GitignoreRule {
    regex: Regex,
    directory_only: bool,
}
2182
2183impl GitignoreMatcher {
2184    fn from_repository(repo_root: &Path) -> Result<Self> {
2185        let gitignore_path = repo_root.join(".gitignore");
2186        if !gitignore_path.exists() {
2187            return Ok(Self { rules: Vec::new() });
2188        }
2189
2190        let raw = fs::read_to_string(&gitignore_path)
2191            .with_context(|| format!("failed to read {}", gitignore_path.display()))?;
2192
2193        let mut rules = Vec::new();
2194        for line in raw.lines() {
2195            let trimmed = line.trim();
2196            if trimmed.is_empty() || trimmed.starts_with('#') || trimmed.starts_with('!') {
2197                continue;
2198            }
2199
2200            if let Some(rule) = GitignoreRule::from_pattern(trimmed) {
2201                rules.push(rule);
2202            }
2203        }
2204
2205        Ok(Self { rules })
2206    }
2207
2208    fn is_ignored(&self, rel_path: &str, is_dir: bool) -> bool {
2209        for rule in &self.rules {
2210            if rule.directory_only && !is_dir {
2211                continue;
2212            }
2213            if rule.regex.is_match(rel_path) {
2214                return true;
2215            }
2216        }
2217        false
2218    }
2219}
2220
2221impl GitignoreRule {
2222    fn from_pattern(pattern: &str) -> Option<Self> {
2223        let directory_only = pattern.ends_with('/');
2224        let mut core = pattern.trim_end_matches('/').trim_start_matches("./");
2225
2226        if core.is_empty() {
2227            return None;
2228        }
2229
2230        let anchored = core.starts_with('/');
2231        core = core.trim_start_matches('/');
2232
2233        let mut regex = String::new();
2234        if anchored {
2235            regex.push('^');
2236        } else {
2237            regex.push_str("(^|.*/)");
2238        }
2239
2240        regex.push_str(&glob_to_regex(core));
2241
2242        if directory_only {
2243            regex.push_str("($|/.*)");
2244        } else {
2245            regex.push('$');
2246        }
2247
2248        let compiled = Regex::new(&regex).ok()?;
2249
2250        Some(Self {
2251            regex: compiled,
2252            directory_only,
2253        })
2254    }
2255}
2256
/// Translates a glob fragment to regex source: `**` -> `.*`, `*` -> `[^/]*`,
/// `?` -> `[^/]`, regex metacharacters escaped, everything else copied
/// verbatim. Character classes (`[...]`) are NOT supported — brackets are
/// escaped literally.
fn glob_to_regex(glob: &str) -> String {
    let chars: Vec<char> = glob.chars().collect();
    let mut out = String::with_capacity(glob.len() * 2);
    let mut i = 0;

    while i < chars.len() {
        match chars[i] {
            '*' if chars.get(i + 1) == Some(&'*') => {
                // `**` crosses directory boundaries.
                out.push_str(".*");
                i += 1; // consume the second '*'
            }
            '*' => out.push_str("[^/]*"),
            '?' => out.push_str("[^/]"),
            c if ".+()|^${}[]\\".contains(c) => {
                out.push('\\');
                out.push(c);
            }
            c => out.push(c),
        }
        i += 1;
    }

    out
}
2282
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;
    use tempfile::tempdir;

    // A freshly created document carries no codegraph profile markers, so
    // validation must fail with a marker diagnostic (CG1001/CG1002).
    #[test]
    fn test_validate_profile_detects_missing_markers() {
        let doc = Document::create();
        let result = validate_code_graph_profile(&doc);
        assert!(!result.valid);
        assert!(result
            .diagnostics
            .iter()
            .any(|d| d.code == "CG1001" || d.code == "CG1002"));
    }

    // Building the same repository twice must yield an identical canonical
    // fingerprint and identical canonical JSON — determinism smoke test.
    #[test]
    fn test_canonical_fingerprint_stable_for_equivalent_docs() {
        let dir = tempdir().unwrap();
        let root = dir.path();
        fs::create_dir_all(root.join("src")).unwrap();
        fs::write(root.join("src/lib.rs"), "pub fn a() {}\n").unwrap();

        let input = CodeGraphBuildInput {
            repository_path: root.to_path_buf(),
            commit_hash: "abc123".to_string(),
            config: CodeGraphExtractorConfig::default(),
        };

        let first = build_code_graph(&input).unwrap();
        let second = build_code_graph(&input).unwrap();

        assert_eq!(first.canonical_fingerprint, second.canonical_fingerprint);
        assert_eq!(
            canonical_codegraph_json(&first.document).unwrap(),
            canonical_codegraph_json(&second.document).unwrap()
        );
    }

    // Serializing to PortableDocument and back must not change the
    // canonical fingerprint (lossless round-trip).
    #[test]
    fn test_portable_document_roundtrip_preserves_fingerprint() {
        let dir = tempdir().unwrap();
        fs::create_dir_all(dir.path().join("pkg")).unwrap();
        fs::write(
            dir.path().join("pkg/main.py"),
            "from .util import helper\n\ndef run():\n    return helper()\n",
        )
        .unwrap();
        fs::write(
            dir.path().join("pkg/util.py"),
            "def helper():\n    return 1\n",
        )
        .unwrap();

        let build = build_code_graph(&CodeGraphBuildInput {
            repository_path: dir.path().to_path_buf(),
            commit_hash: "def456".to_string(),
            config: CodeGraphExtractorConfig::default(),
        })
        .unwrap();

        let portable = PortableDocument::from_document(&build.document);
        let json = serde_json::to_string_pretty(&portable).unwrap();
        let decoded: PortableDocument = serde_json::from_str(&json).unwrap();
        let roundtripped = decoded.to_document().unwrap();

        let fp1 = canonical_fingerprint(&build.document).unwrap();
        let fp2 = canonical_fingerprint(&roundtripped).unwrap();
        assert_eq!(fp1, fp2);
    }

    // An import of a module that does not exist in the repository must
    // surface as a CG2006 warning, not a hard failure.
    #[test]
    fn test_unresolved_import_produces_diagnostic() {
        let dir = tempdir().unwrap();
        fs::create_dir_all(dir.path().join("src")).unwrap();
        fs::write(
            dir.path().join("src/lib.rs"),
            "use crate::missing::thing;\npub fn keep() {}\n",
        )
        .unwrap();

        let build = build_code_graph(&CodeGraphBuildInput {
            repository_path: dir.path().to_path_buf(),
            commit_hash: "ghi789".to_string(),
            config: CodeGraphExtractorConfig::default(),
        })
        .unwrap();

        assert!(build
            .diagnostics
            .iter()
            .any(|d| d.code == "CG2006" && d.severity == CodeGraphSeverity::Warning));
    }

    // A directory-only pattern ("target/") must match both the directory
    // itself and any path beneath it.
    #[test]
    fn test_gitignore_rule_matches() {
        let rule = GitignoreRule::from_pattern("target/").unwrap();
        assert!(rule.regex.is_match("target"));
        assert!(rule.regex.is_match("target/debug/app"));
    }

    // A relative "./util" import resolves against the importing file's
    // directory to a known .ts file.
    #[test]
    fn test_import_resolution_ts_relative() {
        let mut known = BTreeSet::new();
        known.insert("src/main.ts".to_string());
        known.insert("src/util.ts".to_string());

        let resolved = resolve_ts_import("src/main.ts", "./util", &known);
        assert_eq!(resolved.as_deref(), Some("src/util.ts"));
    }

    // Coarse performance guard: a 300-file fixture should build in well
    // under 3 seconds. NOTE(review): wall-clock bounds can flake on slow CI.
    #[test]
    fn test_performance_smoke_medium_fixture() {
        let dir = tempdir().unwrap();
        let src = dir.path().join("src");
        fs::create_dir_all(&src).unwrap();

        // Each module defines one function and imports its predecessor,
        // producing a chain of 299 import edges.
        for i in 0..300usize {
            let mut file = fs::File::create(src.join(format!("m{}.rs", i))).unwrap();
            writeln!(file, "pub fn f{}() {{}}", i).unwrap();
            if i > 0 {
                writeln!(file, "use crate::m{}::f{};", i - 1, i - 1).unwrap();
            }
        }

        let start = std::time::Instant::now();
        let build = build_code_graph(&CodeGraphBuildInput {
            repository_path: dir.path().to_path_buf(),
            commit_hash: "perf-smoke".to_string(),
            config: CodeGraphExtractorConfig::default(),
        })
        .unwrap();
        let elapsed = start.elapsed();

        assert!(build.stats.file_nodes >= 300);
        assert!(elapsed.as_secs_f64() < 3.0, "elapsed: {elapsed:?}");
    }
}