1use serde::{Deserialize, Serialize};
7
8use super::error::EmbedError;
9
10#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize, Default)]
33pub struct RepoIdentifier {
34 pub namespace: String,
37
38 pub name: String,
40
41 #[serde(skip_serializing_if = "Option::is_none")]
43 pub version: Option<String>,
44
45 #[serde(skip_serializing_if = "Option::is_none")]
47 pub branch: Option<String>,
48
49 #[serde(skip_serializing_if = "Option::is_none")]
51 pub commit: Option<String>,
52}
53
54impl RepoIdentifier {
55 pub fn new(namespace: impl Into<String>, name: impl Into<String>) -> Self {
57 Self {
58 namespace: namespace.into(),
59 name: name.into(),
60 version: None,
61 branch: None,
62 commit: None,
63 }
64 }
65
66 pub fn full(
68 namespace: impl Into<String>,
69 name: impl Into<String>,
70 version: Option<String>,
71 branch: Option<String>,
72 commit: Option<String>,
73 ) -> Self {
74 Self { namespace: namespace.into(), name: name.into(), version, branch, commit }
75 }
76
77 pub fn qualified_name(&self) -> String {
79 if self.namespace.is_empty() {
80 self.name.clone()
81 } else {
82 format!("{}/{}", self.namespace, self.name)
83 }
84 }
85
86 pub fn same_repo(&self, other: &Self) -> bool {
88 self.namespace == other.namespace && self.name == other.name
89 }
90}
91
92#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
100pub struct EmbedChunk {
101 pub id: String,
104
105 pub full_hash: String,
107
108 pub content: String,
110
111 pub tokens: u32,
113
114 pub kind: ChunkKind,
116
117 pub source: ChunkSource,
119
120 pub context: ChunkContext,
122
123 #[serde(skip_serializing_if = "Option::is_none")]
125 pub part: Option<ChunkPart>,
126}
127
128#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
133pub struct ChunkSource {
134 #[serde(default, skip_serializing_if = "is_default_repo")]
137 pub repo: RepoIdentifier,
138
139 pub file: String,
141
142 pub lines: (u32, u32),
144
145 pub symbol: String,
147
148 #[serde(skip_serializing_if = "Option::is_none")]
150 pub fqn: Option<String>,
151
152 pub language: String,
154
155 #[serde(skip_serializing_if = "Option::is_none")]
157 pub parent: Option<String>,
158
159 pub visibility: Visibility,
161
162 pub is_test: bool,
164}
165
166fn is_default_repo(repo: &RepoIdentifier) -> bool {
168 repo.namespace.is_empty() && repo.name.is_empty()
169}
170
171#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
176pub struct ChunkContext {
177 #[serde(skip_serializing_if = "Option::is_none")]
179 pub docstring: Option<String>,
180
181 #[serde(skip_serializing_if = "Vec::is_empty", default)]
183 pub comments: Vec<String>,
184
185 #[serde(skip_serializing_if = "Option::is_none")]
187 pub signature: Option<String>,
188
189 #[serde(skip_serializing_if = "Vec::is_empty", default)]
191 pub calls: Vec<String>,
192
193 #[serde(skip_serializing_if = "Vec::is_empty", default)]
195 pub called_by: Vec<String>,
196
197 #[serde(skip_serializing_if = "Vec::is_empty", default)]
199 pub imports: Vec<String>,
200
201 #[serde(skip_serializing_if = "Vec::is_empty", default)]
203 pub tags: Vec<String>,
204
205 #[serde(skip_serializing_if = "is_zero", default)]
210 pub lines_of_code: u32,
211
212 #[serde(skip_serializing_if = "is_zero", default)]
215 pub max_nesting_depth: u32,
216}
217
/// Predicate for `#[serde(skip_serializing_if = ...)]`: `true` when the value is `0`.
///
/// Takes a reference because serde passes the field by reference.
fn is_zero(n: &u32) -> bool {
    matches!(*n, 0)
}
222
/// Serde default provider for `EmbedSettings::hierarchy_min_children`.
///
/// Always returns `2`.
fn default_hierarchy_min_children() -> usize {
    const MIN_CHILDREN: usize = 2;
    MIN_CHILDREN
}
227
228#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, Default)]
230#[serde(rename_all = "snake_case")]
231pub enum ChunkKind {
232 #[default]
233 Function,
234 Method,
235 Class,
236 Struct,
237 Enum,
238 Interface,
239 Trait,
240 Module,
241 Constant,
242 Variable,
243 Imports,
244 TopLevel,
245 FunctionPart,
246 ClassPart,
247}
248
249impl ChunkKind {
250 pub fn name(&self) -> &'static str {
252 match self {
253 Self::Function => "function",
254 Self::Method => "method",
255 Self::Class => "class",
256 Self::Struct => "struct",
257 Self::Enum => "enum",
258 Self::Interface => "interface",
259 Self::Trait => "trait",
260 Self::Module => "module",
261 Self::Constant => "constant",
262 Self::Variable => "variable",
263 Self::Imports => "imports",
264 Self::TopLevel => "top_level",
265 Self::FunctionPart => "function_part",
266 Self::ClassPart => "class_part",
267 }
268 }
269
270 pub fn is_part(&self) -> bool {
272 matches!(self, Self::FunctionPart | Self::ClassPart)
273 }
274}
275
276#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, Default)]
278#[serde(rename_all = "snake_case")]
279pub enum Visibility {
280 #[default]
281 Public,
282 Private,
283 Protected,
284 Internal,
285}
286
287impl Visibility {
288 pub fn name(&self) -> &'static str {
290 match self {
291 Self::Public => "public",
292 Self::Private => "private",
293 Self::Protected => "protected",
294 Self::Internal => "internal",
295 }
296 }
297}
298
299#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
301pub struct ChunkPart {
302 pub part: u32,
304
305 pub of: u32,
307
308 pub parent_id: String,
310
311 pub parent_signature: String,
313
314 #[serde(skip_serializing_if = "is_zero", default)]
317 pub overlap_lines: u32,
318}
319
320#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
326pub struct EmbedSettings {
327 pub max_tokens: u32,
329
330 pub min_tokens: u32,
332
333 pub overlap_tokens: u32,
335
336 pub context_lines: u32,
338
339 pub include_imports: bool,
341
342 pub include_top_level: bool,
344
345 pub token_model: String,
347
348 pub algorithm_version: u32,
350
351 pub scan_secrets: bool,
353
354 pub fail_on_secrets: bool,
356
357 pub redact_secrets: bool,
359
360 #[serde(default)]
363 pub include_patterns: Vec<String>,
364
365 #[serde(default)]
368 pub exclude_patterns: Vec<String>,
369
370 #[serde(default)]
372 pub include_tests: bool,
373
374 #[serde(default)]
382 pub enable_hierarchy: bool,
383
384 #[serde(default = "default_hierarchy_min_children")]
387 pub hierarchy_min_children: usize,
388}
389
390impl Default for EmbedSettings {
391 fn default() -> Self {
392 Self {
393 max_tokens: 1000, min_tokens: 50, overlap_tokens: 100, context_lines: 5, include_imports: true, include_top_level: true, token_model: "claude".to_owned(),
400 algorithm_version: 1,
401 scan_secrets: true, fail_on_secrets: false,
403 redact_secrets: true, include_patterns: Vec::new(),
405 exclude_patterns: Vec::new(),
406 include_tests: false,
407 enable_hierarchy: false, hierarchy_min_children: 2, }
410 }
411}
412
413impl EmbedSettings {
414 pub const CURRENT_ALGORITHM_VERSION: u32 = 1;
416
417 pub const MAX_TOKENS_LIMIT: u32 = 100_000;
419
420 pub fn for_embedding_model(model: &str) -> Self {
428 let mut settings = Self::default();
429 settings.max_tokens = match model.to_lowercase().as_str() {
430 "voyage-code-2" | "voyage-code-3" => 1500,
431 "cohere-embed-v3" | "cohere" => 400,
432 "openai-text-embedding-3-small" | "openai-text-embedding-3-large" | "openai" => 800,
433 "sentence-transformers" | "all-minilm" | "minilm" => 384,
434 _ => 1000, };
436 settings
437 }
438
439 pub fn validate(&self) -> Result<(), EmbedError> {
441 if self.max_tokens > Self::MAX_TOKENS_LIMIT {
442 return Err(EmbedError::InvalidSettings {
443 field: "max_tokens".to_owned(),
444 reason: format!("exceeds limit of {}", Self::MAX_TOKENS_LIMIT),
445 });
446 }
447 if self.min_tokens > self.max_tokens {
448 return Err(EmbedError::InvalidSettings {
449 field: "min_tokens".to_owned(),
450 reason: "cannot exceed max_tokens".to_owned(),
451 });
452 }
453 if self.algorithm_version > Self::CURRENT_ALGORITHM_VERSION {
454 return Err(EmbedError::UnsupportedAlgorithmVersion {
455 found: self.algorithm_version,
456 max_supported: Self::CURRENT_ALGORITHM_VERSION,
457 });
458 }
459 Ok(())
460 }
461
462 pub fn for_ci() -> Self {
466 Self {
467 fail_on_secrets: true,
468 scan_secrets: true,
469 redact_secrets: false, ..Self::default()
471 }
472 }
473}
474
475impl From<crate::types::SymbolKind> for ChunkKind {
477 fn from(kind: crate::types::SymbolKind) -> Self {
478 match kind {
479 crate::types::SymbolKind::Function => ChunkKind::Function,
480 crate::types::SymbolKind::Method => ChunkKind::Method,
481 crate::types::SymbolKind::Class => ChunkKind::Class,
482 crate::types::SymbolKind::Struct => ChunkKind::Struct,
483 crate::types::SymbolKind::Enum => ChunkKind::Enum,
484 crate::types::SymbolKind::Interface => ChunkKind::Interface,
485 crate::types::SymbolKind::Trait => ChunkKind::Trait,
486 crate::types::SymbolKind::Import => ChunkKind::Imports,
487 crate::types::SymbolKind::Constant => ChunkKind::Constant,
488 crate::types::SymbolKind::Variable => ChunkKind::Variable,
489 crate::types::SymbolKind::TypeAlias => ChunkKind::Struct, crate::types::SymbolKind::Export => ChunkKind::Imports, crate::types::SymbolKind::Module => ChunkKind::Module,
492 crate::types::SymbolKind::Macro => ChunkKind::Function, }
494 }
495}
496
497impl From<crate::types::Visibility> for Visibility {
499 fn from(vis: crate::types::Visibility) -> Self {
500 match vis {
501 crate::types::Visibility::Public => Visibility::Public,
502 crate::types::Visibility::Private => Visibility::Private,
503 crate::types::Visibility::Protected => Visibility::Protected,
504 crate::types::Visibility::Internal => Visibility::Internal,
505 }
506 }
507}
508
#[cfg(test)]
mod tests {
    use super::*;

    /// The default settings expose the expected token bounds and scan secrets.
    #[test]
    fn test_default_settings() {
        let defaults = EmbedSettings::default();
        assert_eq!(
            (defaults.max_tokens, defaults.min_tokens, defaults.overlap_tokens),
            (1000, 50, 100)
        );
        assert!(defaults.scan_secrets);
    }

    /// `validate` accepts the defaults and rejects out-of-range token bounds.
    #[test]
    fn test_validate_settings() {
        let baseline = EmbedSettings::default();
        assert!(baseline.validate().is_ok());

        // max_tokens above MAX_TOKENS_LIMIT.
        let over_limit = EmbedSettings {
            max_tokens: 200_000,
            ..EmbedSettings::default()
        };
        assert!(over_limit.validate().is_err());

        // min_tokens larger than max_tokens.
        let inverted_bounds = EmbedSettings {
            max_tokens: 100,
            min_tokens: 200,
            ..EmbedSettings::default()
        };
        assert!(inverted_bounds.validate().is_err());
    }

    /// Known model names select their own `max_tokens`; unknown ones get 1000.
    #[test]
    fn test_for_embedding_model() {
        let expectations = [
            ("voyage-code-2", 1500),
            ("cohere", 400),
            ("unknown-model", 1000),
        ];
        for (model, expected_max) in expectations {
            let settings = EmbedSettings::for_embedding_model(model);
            assert_eq!(settings.max_tokens, expected_max);
        }
    }

    /// `ChunkKind::name` yields the snake_case variant name.
    #[test]
    fn test_chunk_kind_name() {
        assert_eq!(ChunkKind::Function.name(), "function");
        assert_eq!(ChunkKind::FunctionPart.name(), "function_part");
    }

    /// Only the two `*Part` variants report `is_part`.
    #[test]
    fn test_chunk_kind_is_part() {
        for part_kind in [ChunkKind::FunctionPart, ChunkKind::ClassPart] {
            assert!(part_kind.is_part());
        }
        assert!(!ChunkKind::Function.is_part());
    }

    /// `Visibility::name` yields the lowercase variant name.
    #[test]
    fn test_visibility_name() {
        assert_eq!(Visibility::Public.name(), "public");
        assert_eq!(Visibility::Private.name(), "private");
    }

    /// Settings survive a JSON serialize/deserialize round trip unchanged.
    #[test]
    fn test_settings_serialization() {
        let original = EmbedSettings::default();
        let encoded = serde_json::to_string(&original).unwrap();
        let decoded: EmbedSettings = serde_json::from_str(&encoded).unwrap();
        assert_eq!(original, decoded);
    }

    /// CI settings scan for secrets, fail on them and do not redact.
    #[test]
    fn test_ci_settings() {
        let ci_settings = EmbedSettings::for_ci();
        assert!(ci_settings.fail_on_secrets);
        assert!(ci_settings.scan_secrets);
        assert!(!ci_settings.redact_secrets);
    }
}