Skip to main content

trusty_contracts/
lib.rs

1//! Shared entity types for the trusty-* toolchain.
2//!
3//! Why: the knowledge-graph layer (`trusty-search-core::symbol_graph`), the
4//! analysis sidecar (`trusty-analyzer-core`), and ingest pipelines (SCIP, NER,
5//! concept-cluster) all consume the same `EntityType` / `EdgeKind` / `RawEntity`
6//! shapes. Extracting them into a tree-sitter–free crate lets analyzer crates
7//! depend on the shapes without inheriting 16 language grammars.
8//!
9//! What: pure data definitions — enums, structs, and the `fact_hash_str`
10//! helper. No async, no tokio, no tree-sitter. The tree-sitter–based
11//! extraction code (`extract_entities`, `EntityExtractor`, language-specific
12//! walkers) stays in `trusty-search-core::entity`.
13//!
14//! Test: see `#[cfg(test)]` in this file — covers `RawEntity::new` id
15//! stability, `EdgeKind::score_multiplier`, and `fact_hash_str` determinism.
16
17use serde::{Deserialize, Serialize};
18use sha2::{Digest, Sha256};
19
20/// Taxonomy of program entities surfaced from source code.
21#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Hash)]
22pub enum EntityType {
23    /// Type identifiers (`Arc`, `Vec`, `CodeChunk`, …).
24    NamedType,
25    /// Trait bound expressions (`Send + Sync`, `Serialize`, …).
26    TraitBound,
27    /// Module paths (`crate::indexer::CodeIndexer`, `std::sync::Arc`).
28    ModulePath,
29    /// Error/panic call sites: `bail!`, `anyhow!`, `panic!`, `unwrap`.
30    ErrorVariant,
31    /// Identifiers referenced from `#[test]` function bodies.
32    TestRelation,
33    /// Doc-comment derived concept (NLP phrase / keyword).
34    DocConcept,
35    /// Attribute annotations (`#[derive(...)]`, `#[cfg(...)]`).
36    Annotation,
37    /// String literals longer than 10 characters.
38    LiteralString,
39    /// `type Foo = Bar` aliases.
40    TypeAlias,
41    /// Top-level `const`/`static` symbol.
42    ConstantSymbol,
43    /// Top-level `use` of a non-stdlib, non-self/super/crate path.
44    ExternalCrate,
45    /// Cluster of co-occurring concepts (Phase C).
46    ConceptCluster,
47    /// Free-form natural-language phrase pulled from docs/comments.
48    NaturalLanguagePhrase,
49}
50
51impl EntityType {
52    /// Stable string tag used in `RawEntity::new` id hashing. Changing any of
53    /// these strings invalidates previously persisted entity ids.
54    pub fn as_str(&self) -> &'static str {
55        match self {
56            Self::NamedType => "NamedType",
57            Self::TraitBound => "TraitBound",
58            Self::ModulePath => "ModulePath",
59            Self::ErrorVariant => "ErrorVariant",
60            Self::TestRelation => "TestRelation",
61            Self::DocConcept => "DocConcept",
62            Self::Annotation => "Annotation",
63            Self::LiteralString => "LiteralString",
64            Self::TypeAlias => "TypeAlias",
65            Self::ConstantSymbol => "ConstantSymbol",
66            Self::ExternalCrate => "ExternalCrate",
67            Self::ConceptCluster => "ConceptCluster",
68            Self::NaturalLanguagePhrase => "NaturalLanguagePhrase",
69        }
70    }
71}
72
73/// Edge kinds for the `SymbolGraph` knowledge graph.
74///
75/// Phase A = structural (tree-sitter derived)
76/// Phase B = test-relation
77/// Phase C = doc/concept
78#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Hash)]
79pub enum EdgeKind {
80    // Call graph
81    /// Caller → callee.
82    CallsFunction,
83    /// Callee → caller (reverse index of `CallsFunction`).
84    CalledByFunction,
85    // Phase A — structural
86    Implements,
87    UsesType,
88    Derives,
89    ModuleContains,
90    ReExports,
91    RaisesError,
92    Configures,
93    // Phase B — test relations
94    TestedBy,
95    TestUsesFixture,
96    CoOccursInTest,
97    // Phase C — docs / concepts
98    Documents,
99    ReferencesConcept,
100    Aliases,
101    ErrorDescribes,
102}
103
104impl EdgeKind {
105    /// Score multiplier for KG expansion. Higher = more relevant when ranking
106    /// neighbours discovered by walking this edge.
107    pub fn score_multiplier(&self) -> f32 {
108        match self {
109            EdgeKind::Implements => 0.85,
110            EdgeKind::UsesType => 0.75,
111            EdgeKind::TestedBy => 0.80,
112            EdgeKind::Documents => 0.65,
113            EdgeKind::ReferencesConcept => 0.60,
114            // Remaining edges use the legacy flat KG-expansion multiplier.
115            _ => 0.70,
116        }
117    }
118}
119
/// Names of the redb tables used for entity storage.
///
/// Each constant's doc comment gives the table's `key -> value` layout.
pub mod tables {
    /// Entity records: `entity_id (str) -> RawEntity (bincode/json)`.
    pub const ENTITIES: &str = "entities";

    /// Edge set: `(from_entity_id, edge_kind, to_entity_id) -> ()`.
    pub const ENTITY_EDGES: &str = "entity_edges";

    /// Forward index: `chunk_id -> Vec<entity_id>`.
    pub const CHUNK_ENTITIES: &str = "chunk_entities";

    /// Reverse index of `CHUNK_ENTITIES`: `entity_id -> Vec<chunk_id>`.
    pub const ENTITY_CHUNKS: &str = "entity_chunks";
}
131
132/// One extracted entity, anchored to a byte span and source line.
133#[derive(Debug, Clone, Serialize, Deserialize)]
134pub struct RawEntity {
135    /// Stable hash of (entity_type, text, file).
136    pub id: String,
137    pub entity_type: EntityType,
138    pub text: String,
139    pub span: (usize, usize),
140    pub file: String,
141    pub line: usize,
142}
143
144impl RawEntity {
145    /// Construct a `RawEntity` with a deterministic SHA-256 id derived from
146    /// `(entity_type, text, file)`. Same inputs always yield the same id, so
147    /// re-extraction over identical source produces stable references for the
148    /// KG layer.
149    pub fn new(
150        entity_type: EntityType,
151        text: String,
152        span: (usize, usize),
153        file: &str,
154        line: usize,
155    ) -> Self {
156        let mut h = Sha256::new();
157        h.update(entity_type.as_str().as_bytes());
158        h.update(b"\0");
159        h.update(text.as_bytes());
160        h.update(b"\0");
161        h.update(file.as_bytes());
162        let id = format!("{:x}", h.finalize());
163        Self {
164            id,
165            entity_type,
166            text,
167            span,
168            file: file.to_string(),
169            line,
170        }
171    }
172}
173
/// Short hex hash of a string. Used by ingest sources (e.g. SCIP) to derive
/// compact entity-id suffixes from opaque symbol strings.
///
/// Why: SCIP symbol strings (e.g. `"rust-analyzer cargo crate/Foo#"`) are
/// long and noisy. Hashing them produces a compact suffix safe to embed in
/// entity ids and redb keys.
/// What: hashes `s` with `DefaultHasher` and formats the resulting `u64` as
/// lowercase hex, zero-padded to *at least* 8 characters. `{:08x}` is a
/// minimum width, not a truncation, so the result is 8 to 16 characters long.
/// Test: `fact_hash_str_is_deterministic`.
///
/// # Stability
///
/// The output is deterministic for a given build, but the standard library
/// does not specify `DefaultHasher`'s algorithm and warns that its hashes
/// should not be relied upon across Rust releases. Values persisted from this
/// function may therefore stop matching after a toolchain upgrade.
/// NOTE(review): if these hashes are stored long-term, consider migrating to
/// a hash with a pinned algorithm; the algorithm is left untouched here so
/// already-persisted ids keep resolving.
pub fn fact_hash_str(s: &str) -> String {
    use std::collections::hash_map::DefaultHasher;
    use std::hash::{Hash, Hasher};

    let mut hasher = DefaultHasher::new();
    s.hash(&mut hasher);
    let digest: u64 = hasher.finish();
    // Minimum width 8 with zero padding; larger values print in full.
    format!("{:08x}", digest)
}
188
#[cfg(test)]
mod tests {
    use super::*;
    use std::collections::HashSet;

    /// True when two multipliers agree to within float noise.
    fn approx_eq(actual: f32, expected: f32) -> bool {
        (actual - expected).abs() < 1e-6
    }

    #[test]
    fn raw_entity_id_is_stable() {
        let a = RawEntity::new(EntityType::NamedType, "Foo".into(), (0, 3), "src/x.rs", 1);
        let b = RawEntity::new(
            EntityType::NamedType,
            "Foo".into(),
            (10, 13),
            "src/x.rs",
            99,
        );
        // Same (type, text, file) → same id even when span/line differ.
        assert_eq!(a.id, b.id);
    }

    #[test]
    fn raw_entity_id_changes_with_type() {
        let a = RawEntity::new(EntityType::NamedType, "Foo".into(), (0, 3), "src/x.rs", 1);
        let b = RawEntity::new(EntityType::ModulePath, "Foo".into(), (0, 3), "src/x.rs", 1);
        assert_ne!(a.id, b.id);
    }

    #[test]
    fn raw_entity_id_changes_with_text_and_file() {
        let base = RawEntity::new(EntityType::NamedType, "Foo".into(), (0, 3), "src/x.rs", 1);
        let other_text =
            RawEntity::new(EntityType::NamedType, "Bar".into(), (0, 3), "src/x.rs", 1);
        let other_file =
            RawEntity::new(EntityType::NamedType, "Foo".into(), (0, 3), "src/y.rs", 1);
        assert_ne!(base.id, other_text.id);
        assert_ne!(base.id, other_file.id);
    }

    #[test]
    fn raw_entity_id_respects_field_boundaries() {
        // The NUL separator between text and file must keep ("ab", "c") and
        // ("a", "bc") from hashing the same byte stream.
        let a = RawEntity::new(EntityType::NamedType, "ab".into(), (0, 2), "c", 1);
        let b = RawEntity::new(EntityType::NamedType, "a".into(), (0, 1), "bc", 1);
        assert_ne!(a.id, b.id);
    }

    #[test]
    fn raw_entity_id_is_lowercase_sha256_hex() {
        let e = RawEntity::new(EntityType::NamedType, "Foo".into(), (0, 3), "src/x.rs", 1);
        // SHA-256 is 32 bytes → 64 hex characters.
        assert_eq!(e.id.len(), 64);
        assert!(e
            .id
            .chars()
            .all(|c| c.is_ascii_digit() || ('a'..='f').contains(&c)));
    }

    #[test]
    fn edge_kind_score_multiplier_known_values() {
        assert!(approx_eq(EdgeKind::Implements.score_multiplier(), 0.85));
        assert!(approx_eq(EdgeKind::UsesType.score_multiplier(), 0.75));
        assert!(approx_eq(EdgeKind::TestedBy.score_multiplier(), 0.80));
        assert!(approx_eq(EdgeKind::Documents.score_multiplier(), 0.65));
        assert!(approx_eq(
            EdgeKind::ReferencesConcept.score_multiplier(),
            0.60
        ));

        // Every remaining edge uses the flat legacy multiplier.
        let flat = [
            EdgeKind::CallsFunction,
            EdgeKind::CalledByFunction,
            EdgeKind::Derives,
            EdgeKind::ModuleContains,
            EdgeKind::ReExports,
            EdgeKind::RaisesError,
            EdgeKind::Configures,
            EdgeKind::TestUsesFixture,
            EdgeKind::CoOccursInTest,
            EdgeKind::Aliases,
            EdgeKind::ErrorDescribes,
        ];
        for kind in flat.iter() {
            assert!(
                approx_eq(kind.score_multiplier(), 0.70),
                "unexpected multiplier for {:?}",
                kind
            );
        }
    }

    #[test]
    fn fact_hash_str_is_deterministic() {
        let a = fact_hash_str("rust-analyzer cargo crate/Foo#");
        let b = fact_hash_str("rust-analyzer cargo crate/Foo#");
        assert_eq!(a, b);
        // u64 in lowercase hex; `{:08x}` is the *min* width, so output is
        // up to 16 characters (and always at least 8 due to zero-padding).
        assert!(a.len() >= 8 && a.len() <= 16);
        assert!(a.chars().all(|c| c.is_ascii_hexdigit()));
    }

    #[test]
    fn entity_type_as_str_round_trip() {
        // Every variant must have a non-empty tag, and because the tag feeds
        // `RawEntity::new` id hashing, no two variants may share one.
        let variants = [
            EntityType::NamedType,
            EntityType::TraitBound,
            EntityType::ModulePath,
            EntityType::ErrorVariant,
            EntityType::TestRelation,
            EntityType::DocConcept,
            EntityType::Annotation,
            EntityType::LiteralString,
            EntityType::TypeAlias,
            EntityType::ConstantSymbol,
            EntityType::ExternalCrate,
            EntityType::ConceptCluster,
            EntityType::NaturalLanguagePhrase,
        ];
        let mut seen = HashSet::new();
        for v in variants.iter() {
            let tag = v.as_str();
            assert!(!tag.is_empty());
            assert!(seen.insert(tag), "duplicate tag {:?}", tag);
        }
        assert_eq!(seen.len(), variants.len());
    }
}