1use serde::{Deserialize, Serialize};
7
8use super::error::EmbedError;
9
10#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize, Default)]
33pub struct RepoIdentifier {
34 pub namespace: String,
37
38 pub name: String,
40
41 #[serde(skip_serializing_if = "Option::is_none")]
43 pub version: Option<String>,
44
45 #[serde(skip_serializing_if = "Option::is_none")]
47 pub branch: Option<String>,
48
49 #[serde(skip_serializing_if = "Option::is_none")]
51 pub commit: Option<String>,
52}
53
54impl RepoIdentifier {
55 pub fn new(namespace: impl Into<String>, name: impl Into<String>) -> Self {
57 Self {
58 namespace: namespace.into(),
59 name: name.into(),
60 version: None,
61 branch: None,
62 commit: None,
63 }
64 }
65
66 pub fn full(
68 namespace: impl Into<String>,
69 name: impl Into<String>,
70 version: Option<String>,
71 branch: Option<String>,
72 commit: Option<String>,
73 ) -> Self {
74 Self {
75 namespace: namespace.into(),
76 name: name.into(),
77 version,
78 branch,
79 commit,
80 }
81 }
82
83 pub fn qualified_name(&self) -> String {
85 if self.namespace.is_empty() {
86 self.name.clone()
87 } else {
88 format!("{}/{}", self.namespace, self.name)
89 }
90 }
91
92 pub fn same_repo(&self, other: &Self) -> bool {
94 self.namespace == other.namespace && self.name == other.name
95 }
96}
97
98#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
106pub struct EmbedChunk {
107 pub id: String,
110
111 pub full_hash: String,
113
114 pub content: String,
116
117 pub tokens: u32,
119
120 pub kind: ChunkKind,
122
123 pub source: ChunkSource,
125
126 pub context: ChunkContext,
128
129 #[serde(skip_serializing_if = "Option::is_none")]
131 pub part: Option<ChunkPart>,
132}
133
134#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
139pub struct ChunkSource {
140 #[serde(default, skip_serializing_if = "is_default_repo")]
143 pub repo: RepoIdentifier,
144
145 pub file: String,
147
148 pub lines: (u32, u32),
150
151 pub symbol: String,
153
154 #[serde(skip_serializing_if = "Option::is_none")]
156 pub fqn: Option<String>,
157
158 pub language: String,
160
161 #[serde(skip_serializing_if = "Option::is_none")]
163 pub parent: Option<String>,
164
165 pub visibility: Visibility,
167
168 pub is_test: bool,
170}
171
172fn is_default_repo(repo: &RepoIdentifier) -> bool {
174 repo.namespace.is_empty() && repo.name.is_empty()
175}
176
177#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
182pub struct ChunkContext {
183 #[serde(skip_serializing_if = "Option::is_none")]
185 pub docstring: Option<String>,
186
187 #[serde(skip_serializing_if = "Vec::is_empty", default)]
189 pub comments: Vec<String>,
190
191 #[serde(skip_serializing_if = "Option::is_none")]
193 pub signature: Option<String>,
194
195 #[serde(skip_serializing_if = "Vec::is_empty", default)]
197 pub calls: Vec<String>,
198
199 #[serde(skip_serializing_if = "Vec::is_empty", default)]
201 pub called_by: Vec<String>,
202
203 #[serde(skip_serializing_if = "Vec::is_empty", default)]
205 pub imports: Vec<String>,
206
207 #[serde(skip_serializing_if = "Vec::is_empty", default)]
209 pub tags: Vec<String>,
210
211 #[serde(skip_serializing_if = "is_zero", default)]
217 pub lines_of_code: u32,
218
219 #[serde(skip_serializing_if = "is_zero", default)]
222 pub max_nesting_depth: u32,
223}
224
/// Serde `skip_serializing_if` predicate: returns `true` when `n` is zero.
///
/// Takes a reference because serde passes the field by reference.
fn is_zero(n: &u32) -> bool {
    *n == 0
}
229
/// Serde default for `EmbedSettings::hierarchy_min_children`, used when the
/// field is missing from deserialized input.
fn default_hierarchy_min_children() -> usize {
    2
}
234
235#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, Default)]
237#[serde(rename_all = "snake_case")]
238pub enum ChunkKind {
239 #[default]
240 Function,
241 Method,
242 Class,
243 Struct,
244 Enum,
245 Interface,
246 Trait,
247 Module,
248 Constant,
249 Variable,
250 Imports,
251 TopLevel,
252 FunctionPart,
253 ClassPart,
254}
255
256impl ChunkKind {
257 pub fn name(&self) -> &'static str {
259 match self {
260 Self::Function => "function",
261 Self::Method => "method",
262 Self::Class => "class",
263 Self::Struct => "struct",
264 Self::Enum => "enum",
265 Self::Interface => "interface",
266 Self::Trait => "trait",
267 Self::Module => "module",
268 Self::Constant => "constant",
269 Self::Variable => "variable",
270 Self::Imports => "imports",
271 Self::TopLevel => "top_level",
272 Self::FunctionPart => "function_part",
273 Self::ClassPart => "class_part",
274 }
275 }
276
277 pub fn is_part(&self) -> bool {
279 matches!(self, Self::FunctionPart | Self::ClassPart)
280 }
281}
282
283#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, Default)]
285#[serde(rename_all = "snake_case")]
286pub enum Visibility {
287 #[default]
288 Public,
289 Private,
290 Protected,
291 Internal,
292}
293
294impl Visibility {
295 pub fn name(&self) -> &'static str {
297 match self {
298 Self::Public => "public",
299 Self::Private => "private",
300 Self::Protected => "protected",
301 Self::Internal => "internal",
302 }
303 }
304}
305
306#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
308pub struct ChunkPart {
309 pub part: u32,
311
312 pub of: u32,
314
315 pub parent_id: String,
317
318 pub parent_signature: String,
320
321 #[serde(skip_serializing_if = "is_zero", default)]
324 pub overlap_lines: u32,
325}
326
327#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
333pub struct EmbedSettings {
334 pub max_tokens: u32,
336
337 pub min_tokens: u32,
339
340 pub overlap_tokens: u32,
342
343 pub context_lines: u32,
345
346 pub include_imports: bool,
348
349 pub include_top_level: bool,
351
352 pub token_model: String,
354
355 pub algorithm_version: u32,
357
358 pub scan_secrets: bool,
360
361 pub fail_on_secrets: bool,
363
364 pub redact_secrets: bool,
366
367 #[serde(default)]
370 pub include_patterns: Vec<String>,
371
372 #[serde(default)]
375 pub exclude_patterns: Vec<String>,
376
377 #[serde(default)]
379 pub include_tests: bool,
380
381 #[serde(default)]
389 pub enable_hierarchy: bool,
390
391 #[serde(default = "default_hierarchy_min_children")]
394 pub hierarchy_min_children: usize,
395}
396
397impl Default for EmbedSettings {
398 fn default() -> Self {
399 Self {
400 max_tokens: 1000, min_tokens: 50, overlap_tokens: 100, context_lines: 5, include_imports: true, include_top_level: true, token_model: "claude".to_string(),
407 algorithm_version: 1,
408 scan_secrets: true, fail_on_secrets: false,
410 redact_secrets: true, include_patterns: Vec::new(),
412 exclude_patterns: Vec::new(),
413 include_tests: false,
414 enable_hierarchy: false, hierarchy_min_children: 2, }
417 }
418}
419
420impl EmbedSettings {
421 pub const CURRENT_ALGORITHM_VERSION: u32 = 1;
423
424 pub const MAX_TOKENS_LIMIT: u32 = 100_000;
426
427 pub fn for_embedding_model(model: &str) -> Self {
435 let mut settings = Self::default();
436 settings.max_tokens = match model.to_lowercase().as_str() {
437 "voyage-code-2" | "voyage-code-3" => 1500,
438 "cohere-embed-v3" | "cohere" => 400,
439 "openai-text-embedding-3-small" | "openai-text-embedding-3-large" | "openai" => 800,
440 "sentence-transformers" | "all-minilm" | "minilm" => 384,
441 _ => 1000, };
443 settings
444 }
445
446 pub fn validate(&self) -> Result<(), EmbedError> {
448 if self.max_tokens > Self::MAX_TOKENS_LIMIT {
449 return Err(EmbedError::InvalidSettings {
450 field: "max_tokens".to_string(),
451 reason: format!("exceeds limit of {}", Self::MAX_TOKENS_LIMIT),
452 });
453 }
454 if self.min_tokens > self.max_tokens {
455 return Err(EmbedError::InvalidSettings {
456 field: "min_tokens".to_string(),
457 reason: "cannot exceed max_tokens".to_string(),
458 });
459 }
460 if self.algorithm_version > Self::CURRENT_ALGORITHM_VERSION {
461 return Err(EmbedError::UnsupportedAlgorithmVersion {
462 found: self.algorithm_version,
463 max_supported: Self::CURRENT_ALGORITHM_VERSION,
464 });
465 }
466 Ok(())
467 }
468
469 pub fn for_ci() -> Self {
473 Self {
474 fail_on_secrets: true,
475 scan_secrets: true,
476 redact_secrets: false, ..Self::default()
478 }
479 }
480}
481
482impl From<crate::types::SymbolKind> for ChunkKind {
484 fn from(kind: crate::types::SymbolKind) -> Self {
485 match kind {
486 crate::types::SymbolKind::Function => ChunkKind::Function,
487 crate::types::SymbolKind::Method => ChunkKind::Method,
488 crate::types::SymbolKind::Class => ChunkKind::Class,
489 crate::types::SymbolKind::Struct => ChunkKind::Struct,
490 crate::types::SymbolKind::Enum => ChunkKind::Enum,
491 crate::types::SymbolKind::Interface => ChunkKind::Interface,
492 crate::types::SymbolKind::Trait => ChunkKind::Trait,
493 crate::types::SymbolKind::Import => ChunkKind::Imports,
494 crate::types::SymbolKind::Constant => ChunkKind::Constant,
495 crate::types::SymbolKind::Variable => ChunkKind::Variable,
496 crate::types::SymbolKind::TypeAlias => ChunkKind::Struct, crate::types::SymbolKind::Export => ChunkKind::Imports, crate::types::SymbolKind::Module => ChunkKind::Module,
499 crate::types::SymbolKind::Macro => ChunkKind::Function, }
501 }
502}
503
504impl From<crate::types::Visibility> for Visibility {
506 fn from(vis: crate::types::Visibility) -> Self {
507 match vis {
508 crate::types::Visibility::Public => Visibility::Public,
509 crate::types::Visibility::Private => Visibility::Private,
510 crate::types::Visibility::Protected => Visibility::Protected,
511 crate::types::Visibility::Internal => Visibility::Internal,
512 }
513 }
514}
515
#[cfg(test)]
mod tests {
    use super::*;

    /// The documented baseline values are what `Default` produces.
    #[test]
    fn test_default_settings() {
        let settings = EmbedSettings::default();
        assert_eq!(settings.max_tokens, 1000);
        assert_eq!(settings.min_tokens, 50);
        assert_eq!(settings.overlap_tokens, 100);
        assert!(settings.scan_secrets);
    }

    /// Each rule in `validate` rejects the value it guards against.
    #[test]
    fn test_validate_settings() {
        let mut settings = EmbedSettings::default();
        assert!(settings.validate().is_ok());

        // Above the hard limit.
        settings.max_tokens = 200_000;
        assert!(settings.validate().is_err());

        // Exactly at the limit is still accepted.
        settings.max_tokens = EmbedSettings::MAX_TOKENS_LIMIT;
        assert!(settings.validate().is_ok());

        // min_tokens larger than max_tokens.
        settings.max_tokens = 100;
        settings.min_tokens = 200;
        assert!(settings.validate().is_err());

        // Algorithm version newer than this build supports.
        settings.min_tokens = 50;
        assert!(settings.validate().is_ok());
        settings.algorithm_version = EmbedSettings::CURRENT_ALGORITHM_VERSION + 1;
        assert!(settings.validate().is_err());
    }

    /// Known model names select their token limit; unknown names get 1000.
    #[test]
    fn test_for_embedding_model() {
        let voyage = EmbedSettings::for_embedding_model("voyage-code-2");
        assert_eq!(voyage.max_tokens, 1500);

        let cohere = EmbedSettings::for_embedding_model("cohere");
        assert_eq!(cohere.max_tokens, 400);

        // Lookup ignores letter case.
        let openai = EmbedSettings::for_embedding_model("OpenAI");
        assert_eq!(openai.max_tokens, 800);

        let unknown = EmbedSettings::for_embedding_model("unknown-model");
        assert_eq!(unknown.max_tokens, 1000);
    }

    #[test]
    fn test_chunk_kind_name() {
        assert_eq!(ChunkKind::Function.name(), "function");
        assert_eq!(ChunkKind::FunctionPart.name(), "function_part");
    }

    #[test]
    fn test_chunk_kind_is_part() {
        assert!(ChunkKind::FunctionPart.is_part());
        assert!(ChunkKind::ClassPart.is_part());
        assert!(!ChunkKind::Function.is_part());
    }

    #[test]
    fn test_visibility_name() {
        assert_eq!(Visibility::Public.name(), "public");
        assert_eq!(Visibility::Private.name(), "private");
    }

    /// Settings survive a JSON round-trip unchanged.
    #[test]
    fn test_settings_serialization() {
        let settings = EmbedSettings::default();
        let json = serde_json::to_string(&settings).unwrap();
        let deserialized: EmbedSettings = serde_json::from_str(&json).unwrap();
        assert_eq!(settings, deserialized);
    }

    /// The CI preset fails on secrets instead of redacting them.
    #[test]
    fn test_ci_settings() {
        let ci = EmbedSettings::for_ci();
        assert!(ci.fail_on_secrets);
        assert!(ci.scan_secrets);
        assert!(!ci.redact_secrets);
    }
}