cadi_scraper/
types.rs

1use serde::{Deserialize, Serialize};
2use std::collections::HashMap;
3use std::path::PathBuf;
4
5/// Configuration for the scraper
6#[derive(Debug, Clone, Serialize, Deserialize)]
7pub struct ScraperConfig {
8    /// Registry URL for publishing chunks
9    pub registry_url: Option<String>,
10
11    /// Authentication token for registry
12    pub auth_token: Option<String>,
13
14    /// Namespace for published chunks
15    pub namespace: Option<String>,
16
17    /// Chunking strategy to use
18    pub chunking_strategy: ChunkingStrategy,
19
20    /// Maximum chunk size in bytes
21    pub max_chunk_size: usize,
22
23    /// Include overlapping context between chunks
24    pub include_overlap: bool,
25
26    /// Overlap size in characters
27    pub overlap_size: usize,
28
29    /// Language-specific parsing options
30    pub language_options: HashMap<String, LanguageConfig>,
31
32    /// Patterns to exclude from scraping
33    pub exclude_patterns: Vec<String>,
34
35    /// Whether to create hierarchical chunk relationships
36    pub create_hierarchy: bool,
37
38    /// Auto-extract API surfaces and functions
39    pub extract_api_surface: bool,
40
41    /// Whether to auto-detect licenses
42    pub detect_licenses: bool,
43
44    /// HTTP request timeout in seconds
45    pub request_timeout: u64,
46
47    /// Rate limit: requests per second
48    pub rate_limit: f64,
49
50    /// Local cache directory
51    pub cache_dir: Option<PathBuf>,
52}
53
54/// Language-specific configuration
55#[derive(Debug, Clone, Serialize, Deserialize)]
56pub struct LanguageConfig {
57    /// Minimum chunk size for semantic boundaries
58    pub min_semantic_size: usize,
59
60    /// Whether to split by function/class boundaries
61    pub split_by_semantic_boundary: bool,
62
63    /// Extract function signatures
64    pub extract_functions: bool,
65
66    /// Extract type definitions
67    pub extract_types: bool,
68
69    /// Extract class/struct definitions
70    pub extract_classes: bool,
71}
72
73/// Chunking strategy
74#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
75pub enum ChunkingStrategy {
76    /// Chunk by individual file
77    ByFile,
78
79    /// Chunk by semantic boundaries (functions, classes)
80    Semantic,
81
82    /// Chunk by fixed size
83    FixedSize,
84
85    /// Recursive hierarchical chunking
86    Hierarchical,
87
88    /// Chunk by line count
89    ByLineCount,
90}
91
92impl Default for ChunkingStrategy {
93    fn default() -> Self {
94        ChunkingStrategy::ByFile
95    }
96}
97
98impl Default for ScraperConfig {
99    fn default() -> Self {
100        let mut language_options = HashMap::new();
101        
102        // Default configs for common languages
103        for lang in &["rust", "typescript", "python", "javascript", "go", "c"] {
104            language_options.insert(
105                lang.to_string(),
106                LanguageConfig {
107                    min_semantic_size: 100,
108                    split_by_semantic_boundary: true,
109                    extract_functions: true,
110                    extract_types: true,
111                    extract_classes: true,
112                },
113            );
114        }
115
116        Self {
117            registry_url: None,
118            auth_token: None,
119            namespace: None,
120            chunking_strategy: ChunkingStrategy::ByFile,
121            max_chunk_size: 50 * 1024 * 1024, // 50MB default
122            include_overlap: true,
123            overlap_size: 500,
124            language_options,
125            exclude_patterns: vec![
126                "**/.git".to_string(),
127                "**/node_modules".to_string(),
128                "**/target".to_string(),
129                "**/.venv".to_string(),
130                "**/dist".to_string(),
131                "**/build".to_string(),
132                "**/.DS_Store".to_string(),
133            ],
134            create_hierarchy: true,
135            extract_api_surface: true,
136            detect_licenses: true,
137            request_timeout: 30,
138            rate_limit: 10.0,
139            cache_dir: None,
140        }
141    }
142}
143
144/// Input to the scraper
145#[derive(Debug, Clone, Serialize, Deserialize)]
146pub enum ScraperInput {
147    /// Local file path
148    LocalPath(PathBuf),
149
150    /// HTTP(S) URL
151    Url(String),
152
153    /// Git repository URL
154    GitRepo {
155        url: String,
156        #[serde(skip_serializing_if = "Option::is_none")]
157        branch: Option<String>,
158        #[serde(skip_serializing_if = "Option::is_none")]
159        commit: Option<String>,
160    },
161
162    /// Directory path with optional filters
163    Directory {
164        path: PathBuf,
165        #[serde(skip_serializing_if = "Option::is_none")]
166        patterns: Option<Vec<String>>,
167    },
168}
169
170/// Output from the scraper
171#[derive(Debug, Clone, Serialize, Deserialize)]
172pub struct ScraperOutput {
173    /// Total chunks created
174    pub chunk_count: usize,
175
176    /// Total files processed
177    pub file_count: usize,
178
179    /// Total bytes processed
180    pub total_bytes: u64,
181
182    /// Chunks with their metadata
183    pub chunks: Vec<ScrapedChunk>,
184
185    /// Manifest for all chunks
186    pub manifest: Option<serde_json::Value>,
187
188    /// Errors encountered during scraping
189    pub errors: Vec<String>,
190
191    /// Time taken (milliseconds)
192    pub duration_ms: u128,
193}
194
195/// A single scraped chunk with metadata
196#[derive(Debug, Clone, Serialize, Deserialize)]
197pub struct ScrapedChunk {
198    /// Chunk ID (content hash)
199    pub chunk_id: String,
200
201    /// Chunk type (source-cadi, etc)
202    pub cadi_type: String,
203
204    /// Human-readable name
205    pub name: String,
206
207    /// Description
208    pub description: Option<String>,
209
210    /// Source path or URL
211    pub source: String,
212
213    /// Content hash
214    pub content_hash: String,
215
216    /// File size
217    pub size: usize,
218
219    /// Language/format detected
220    pub language: Option<String>,
221
222    /// Detected concepts/tags
223    pub concepts: Vec<String>,
224
225    /// Detected dependencies
226    pub dependencies: Vec<String>,
227
228    /// License if detected
229    pub license: Option<String>,
230
231    /// Parent chunk ID for hierarchical relationships
232    pub parent_chunk_id: Option<String>,
233
234    /// Child chunk IDs
235    pub child_chunk_ids: Vec<String>,
236
237    /// Metadata tags
238    pub tags: Vec<String>,
239
240    /// Timestamp when scraped
241    pub scraped_at: String,
242}