1use serde::{Deserialize, Serialize};
2use std::collections::HashMap;
3use std::path::PathBuf;
4
5#[derive(Debug, Clone, Serialize, Deserialize)]
7pub struct ScraperConfig {
8 pub registry_url: Option<String>,
10
11 pub auth_token: Option<String>,
13
14 pub namespace: Option<String>,
16
17 pub chunking_strategy: ChunkingStrategy,
19
20 pub max_chunk_size: usize,
22
23 pub include_overlap: bool,
25
26 pub overlap_size: usize,
28
29 pub language_options: HashMap<String, LanguageConfig>,
31
32 pub exclude_patterns: Vec<String>,
34
35 pub create_hierarchy: bool,
37
38 pub extract_api_surface: bool,
40
41 pub detect_licenses: bool,
43
44 pub request_timeout: u64,
46
47 pub rate_limit: f64,
49
50 pub cache_dir: Option<PathBuf>,
52}
53
54#[derive(Debug, Clone, Serialize, Deserialize)]
56pub struct LanguageConfig {
57 pub min_semantic_size: usize,
59
60 pub split_by_semantic_boundary: bool,
62
63 pub extract_functions: bool,
65
66 pub extract_types: bool,
68
69 pub extract_classes: bool,
71}
72
73#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
75#[derive(Default)]
76pub enum ChunkingStrategy {
77 #[default]
79 ByFile,
80
81 Semantic,
83
84 FixedSize,
86
87 Hierarchical,
89
90 ByLineCount,
92}
93
94
95impl Default for ScraperConfig {
96 fn default() -> Self {
97 let mut language_options = HashMap::new();
98
99 for lang in &["rust", "typescript", "python", "javascript", "go", "c"] {
101 language_options.insert(
102 lang.to_string(),
103 LanguageConfig {
104 min_semantic_size: 100,
105 split_by_semantic_boundary: true,
106 extract_functions: true,
107 extract_types: true,
108 extract_classes: true,
109 },
110 );
111 }
112
113 Self {
114 registry_url: None,
115 auth_token: None,
116 namespace: None,
117 chunking_strategy: ChunkingStrategy::ByFile,
118 max_chunk_size: 50 * 1024 * 1024, include_overlap: true,
120 overlap_size: 500,
121 language_options,
122 exclude_patterns: vec![
123 "**/.git".to_string(),
124 "**/node_modules".to_string(),
125 "**/target".to_string(),
126 "**/.venv".to_string(),
127 "**/dist".to_string(),
128 "**/build".to_string(),
129 "**/.DS_Store".to_string(),
130 ],
131 create_hierarchy: true,
132 extract_api_surface: true,
133 detect_licenses: true,
134 request_timeout: 30,
135 rate_limit: 10.0,
136 cache_dir: None,
137 }
138 }
139}
140
141#[derive(Debug, Clone, Serialize, Deserialize)]
143pub enum ScraperInput {
144 LocalPath(PathBuf),
146
147 Url(String),
149
150 GitRepo {
152 url: String,
153 #[serde(skip_serializing_if = "Option::is_none")]
154 branch: Option<String>,
155 #[serde(skip_serializing_if = "Option::is_none")]
156 commit: Option<String>,
157 },
158
159 Directory {
161 path: PathBuf,
162 #[serde(skip_serializing_if = "Option::is_none")]
163 patterns: Option<Vec<String>>,
164 },
165}
166
167#[derive(Debug, Clone, Serialize, Deserialize)]
169pub struct ScraperOutput {
170 pub chunk_count: usize,
172
173 pub file_count: usize,
175
176 pub total_bytes: u64,
178
179 pub chunks: Vec<ScrapedChunk>,
181
182 pub manifest: Option<serde_json::Value>,
184
185 pub errors: Vec<String>,
187
188 pub duration_ms: u128,
190}
191
192#[derive(Debug, Clone, Serialize, Deserialize)]
194pub struct ScrapedChunk {
195 pub chunk_id: String,
197
198 pub cadi_type: String,
200
201 pub name: String,
203
204 pub description: Option<String>,
206
207 pub source: String,
209
210 pub content_hash: String,
212
213 pub size: usize,
215
216 pub language: Option<String>,
218
219 pub concepts: Vec<String>,
221
222 pub dependencies: Vec<String>,
224
225 pub license: Option<String>,
227
228 pub parent_chunk_id: Option<String>,
230
231 pub child_chunk_ids: Vec<String>,
233
234 pub tags: Vec<String>,
236
237 pub scraped_at: String,
239}