trusty_contracts/lib.rs
1//! Shared entity types for the trusty-* toolchain.
2//!
3//! Why: the knowledge-graph layer (`trusty-search-core::symbol_graph`), the
4//! analysis sidecar (`trusty-analyzer-core`), and ingest pipelines (SCIP, NER,
5//! concept-cluster) all consume the same `EntityType` / `EdgeKind` / `RawEntity`
6//! shapes. Extracting them into a tree-sitter–free crate lets analyzer crates
7//! depend on the shapes without inheriting 16 language grammars.
8//!
9//! What: pure data definitions — enums, structs, and the `fact_hash_str`
10//! helper. No async, no tokio, no tree-sitter. The tree-sitter–based
11//! extraction code (`extract_entities`, `EntityExtractor`, language-specific
12//! walkers) stays in `trusty-search-core::entity`.
13//!
14//! Test: see `#[cfg(test)]` in this file — covers `RawEntity::new` id
15//! stability, `EdgeKind::score_multiplier`, and `fact_hash_str` determinism.
16
17use serde::{Deserialize, Serialize};
18use sha2::{Digest, Sha256};
19
20/// Taxonomy of program entities surfaced from source code.
21#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Hash)]
22pub enum EntityType {
23 /// Type identifiers (`Arc`, `Vec`, `CodeChunk`, …).
24 NamedType,
25 /// Trait bound expressions (`Send + Sync`, `Serialize`, …).
26 TraitBound,
27 /// Module paths (`crate::indexer::CodeIndexer`, `std::sync::Arc`).
28 ModulePath,
29 /// Error/panic call sites: `bail!`, `anyhow!`, `panic!`, `unwrap`.
30 ErrorVariant,
31 /// Identifiers referenced from `#[test]` function bodies.
32 TestRelation,
33 /// Doc-comment derived concept (NLP phrase / keyword).
34 DocConcept,
35 /// Attribute annotations (`#[derive(...)]`, `#[cfg(...)]`).
36 Annotation,
37 /// String literals longer than 10 characters.
38 LiteralString,
39 /// `type Foo = Bar` aliases.
40 TypeAlias,
41 /// Top-level `const`/`static` symbol.
42 ConstantSymbol,
43 /// Top-level `use` of a non-stdlib, non-self/super/crate path.
44 ExternalCrate,
45 /// Cluster of co-occurring concepts (Phase C).
46 ConceptCluster,
47 /// Free-form natural-language phrase pulled from docs/comments.
48 NaturalLanguagePhrase,
49}
50
51impl EntityType {
52 /// Stable string tag used in `RawEntity::new` id hashing. Changing any of
53 /// these strings invalidates previously persisted entity ids.
54 pub fn as_str(&self) -> &'static str {
55 match self {
56 Self::NamedType => "NamedType",
57 Self::TraitBound => "TraitBound",
58 Self::ModulePath => "ModulePath",
59 Self::ErrorVariant => "ErrorVariant",
60 Self::TestRelation => "TestRelation",
61 Self::DocConcept => "DocConcept",
62 Self::Annotation => "Annotation",
63 Self::LiteralString => "LiteralString",
64 Self::TypeAlias => "TypeAlias",
65 Self::ConstantSymbol => "ConstantSymbol",
66 Self::ExternalCrate => "ExternalCrate",
67 Self::ConceptCluster => "ConceptCluster",
68 Self::NaturalLanguagePhrase => "NaturalLanguagePhrase",
69 }
70 }
71}
72
73/// Edge kinds for the `SymbolGraph` knowledge graph.
74///
75/// Phase A = structural (tree-sitter derived)
76/// Phase B = test-relation
77/// Phase C = doc/concept
78#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Hash)]
79pub enum EdgeKind {
80 // Call graph
81 /// Caller → callee.
82 CallsFunction,
83 /// Callee → caller (reverse index of `CallsFunction`).
84 CalledByFunction,
85 // Phase A — structural
86 Implements,
87 UsesType,
88 Derives,
89 ModuleContains,
90 ReExports,
91 RaisesError,
92 Configures,
93 // Phase B — test relations
94 TestedBy,
95 TestUsesFixture,
96 CoOccursInTest,
97 // Phase C — docs / concepts
98 Documents,
99 ReferencesConcept,
100 Aliases,
101 ErrorDescribes,
102}
103
104impl EdgeKind {
105 /// Score multiplier for KG expansion. Higher = more relevant when ranking
106 /// neighbours discovered by walking this edge.
107 pub fn score_multiplier(&self) -> f32 {
108 match self {
109 EdgeKind::Implements => 0.85,
110 EdgeKind::UsesType => 0.75,
111 EdgeKind::TestedBy => 0.80,
112 EdgeKind::Documents => 0.65,
113 EdgeKind::ReferencesConcept => 0.60,
114 // Remaining edges use the legacy flat KG-expansion multiplier.
115 _ => 0.70,
116 }
117 }
118}
119
/// Names of the redb tables used for entity storage.
pub mod tables {
    /// Maps `entity_id` (str) to its `RawEntity` (bincode/json).
    pub const ENTITIES: &str = "entities";
    /// Keyed by `(from_entity_id, edge_kind, to_entity_id)`; the value is `()`.
    pub const ENTITY_EDGES: &str = "entity_edges";
    /// Maps `chunk_id` to `Vec<entity_id>`.
    pub const CHUNK_ENTITIES: &str = "chunk_entities";
    /// Maps `entity_id` to `Vec<chunk_id>`; the reverse index of
    /// `CHUNK_ENTITIES`.
    pub const ENTITY_CHUNKS: &str = "entity_chunks";
}
131
132/// One extracted entity, anchored to a byte span and source line.
133#[derive(Debug, Clone, Serialize, Deserialize)]
134pub struct RawEntity {
135 /// Stable hash of (entity_type, text, file).
136 pub id: String,
137 pub entity_type: EntityType,
138 pub text: String,
139 pub span: (usize, usize),
140 pub file: String,
141 pub line: usize,
142}
143
144impl RawEntity {
145 /// Construct a `RawEntity` with a deterministic SHA-256 id derived from
146 /// `(entity_type, text, file)`. Same inputs always yield the same id, so
147 /// re-extraction over identical source produces stable references for the
148 /// KG layer.
149 pub fn new(
150 entity_type: EntityType,
151 text: String,
152 span: (usize, usize),
153 file: &str,
154 line: usize,
155 ) -> Self {
156 let mut h = Sha256::new();
157 h.update(entity_type.as_str().as_bytes());
158 h.update(b"\0");
159 h.update(text.as_bytes());
160 h.update(b"\0");
161 h.update(file.as_bytes());
162 let id = format!("{:x}", h.finalize());
163 Self {
164 id,
165 entity_type,
166 text,
167 span,
168 file: file.to_string(),
169 line,
170 }
171 }
172}
173
/// Short, deterministic hex hash of a string. Used by ingest sources (e.g.
/// SCIP) to derive compact entity IDs from opaque symbol strings.
///
/// Why: SCIP symbol strings (e.g. `"rust-analyzer cargo crate/Foo#"`) are
/// long and noisy. Hashing them produces a compact suffix safe to embed in
/// entity ids and redb keys.
/// What: hashes `s` with `DefaultHasher` and formats the resulting `u64` as
/// lowercase hex. `{:08x}` sets a *minimum* width, so the result is between
/// 8 and 16 characters long (zero-padded only when the value is small) — it
/// is not a fixed 8-character string.
/// Test: `fact_hash_str_is_deterministic`.
///
/// NOTE(review): the standard library documents `DefaultHasher`'s algorithm
/// as unspecified and subject to change between Rust releases, so these
/// hashes are only guaranteed stable for a given toolchain. Because they end
/// up in persisted ids, consider migrating to a fixed algorithm; that is not
/// done here since it would change every existing id.
pub fn fact_hash_str(s: &str) -> String {
    use std::collections::hash_map::DefaultHasher;
    use std::hash::{Hash, Hasher};

    let mut hasher = DefaultHasher::new();
    s.hash(&mut hasher);
    let digest: u64 = hasher.finish();
    // Width 8 is kept exactly as before so previously derived ids still match.
    format!("{:08x}", digest)
}
188
#[cfg(test)]
mod tests {
    use super::*;
    use std::collections::HashSet;

    /// Tolerance used when comparing `f32` multipliers.
    const EPS: f32 = 1e-6;

    /// Builds an entity with a throwaway span/line; only the hashed fields
    /// `(kind, text, file)` matter to the id tests below.
    fn entity(kind: EntityType, text: &str, file: &str) -> RawEntity {
        RawEntity::new(kind, text.to_string(), (0, text.len()), file, 1)
    }

    #[test]
    fn raw_entity_id_is_stable() {
        let a = RawEntity::new(EntityType::NamedType, "Foo".into(), (0, 3), "src/x.rs", 1);
        let b = RawEntity::new(
            EntityType::NamedType,
            "Foo".into(),
            (10, 13),
            "src/x.rs",
            99,
        );
        // Same (type, text, file) → same id even when span/line differ.
        assert_eq!(a.id, b.id);
    }

    #[test]
    fn raw_entity_id_changes_with_type() {
        let a = entity(EntityType::NamedType, "Foo", "src/x.rs");
        let b = entity(EntityType::ModulePath, "Foo", "src/x.rs");
        assert_ne!(a.id, b.id);
    }

    #[test]
    fn raw_entity_id_changes_with_text_and_file() {
        let base = entity(EntityType::NamedType, "Foo", "src/x.rs");
        assert_ne!(base.id, entity(EntityType::NamedType, "Bar", "src/x.rs").id);
        assert_ne!(base.id, entity(EntityType::NamedType, "Foo", "src/y.rs").id);
        // The NUL separator keeps the text/file boundary unambiguous.
        assert_ne!(
            entity(EntityType::NamedType, "ab", "c").id,
            entity(EntityType::NamedType, "a", "bc").id
        );
    }

    #[test]
    fn raw_entity_id_is_lowercase_sha256_hex() {
        let e = entity(EntityType::NamedType, "Foo", "src/x.rs");
        // SHA-256 is 32 bytes → 64 hex characters.
        assert_eq!(e.id.len(), 64);
        assert!(e.id.chars().all(|c| matches!(c, '0'..='9' | 'a'..='f')));
    }

    #[test]
    fn edge_kind_score_multiplier_known_values() {
        let tuned = [
            (EdgeKind::Implements, 0.85_f32),
            (EdgeKind::UsesType, 0.75),
            (EdgeKind::TestedBy, 0.80),
            (EdgeKind::Documents, 0.65),
            (EdgeKind::ReferencesConcept, 0.60),
        ];
        for (kind, expected) in tuned.iter() {
            assert!(
                (kind.score_multiplier() - expected).abs() < EPS,
                "unexpected multiplier for {:?}",
                kind
            );
        }

        // Every remaining edge kind uses the legacy flat multiplier.
        let legacy = [
            EdgeKind::CallsFunction,
            EdgeKind::CalledByFunction,
            EdgeKind::Derives,
            EdgeKind::ModuleContains,
            EdgeKind::ReExports,
            EdgeKind::RaisesError,
            EdgeKind::Configures,
            EdgeKind::TestUsesFixture,
            EdgeKind::CoOccursInTest,
            EdgeKind::Aliases,
            EdgeKind::ErrorDescribes,
        ];
        for kind in legacy.iter() {
            assert!(
                (kind.score_multiplier() - 0.70).abs() < EPS,
                "unexpected multiplier for {:?}",
                kind
            );
        }
    }

    #[test]
    fn fact_hash_str_is_deterministic() {
        let a = fact_hash_str("rust-analyzer cargo crate/Foo#");
        let b = fact_hash_str("rust-analyzer cargo crate/Foo#");
        assert_eq!(a, b);
        // u64 in lowercase hex; `{:08x}` is the *min* width, so output is
        // up to 16 characters (and always at least 8 due to zero-padding).
        assert!(a.len() >= 8 && a.len() <= 16);
        assert!(a.chars().all(|c| c.is_ascii_hexdigit()));
    }

    #[test]
    fn entity_type_as_str_round_trip() {
        let variants = [
            EntityType::NamedType,
            EntityType::TraitBound,
            EntityType::ModulePath,
            EntityType::ErrorVariant,
            EntityType::TestRelation,
            EntityType::DocConcept,
            EntityType::Annotation,
            EntityType::LiteralString,
            EntityType::TypeAlias,
            EntityType::ConstantSymbol,
            EntityType::ExternalCrate,
            EntityType::ConceptCluster,
            EntityType::NaturalLanguagePhrase,
        ];
        // Every variant has a non-empty tag.
        for v in variants.iter() {
            assert!(!v.as_str().is_empty());
        }
        // Tags feed the id hash, so two variants must never share one.
        let distinct: HashSet<&'static str> = variants.iter().map(|v| v.as_str()).collect();
        assert_eq!(distinct.len(), variants.len());
    }
}