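//! `cpd_core/models.rs` — data models shared by the clone-detection pipeline:
//! tokens, source locations, clone fragments, and aggregated statistics.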

use serde::{Deserialize, Serialize};
use std::collections::HashMap;

4#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
5#[serde(rename_all = "snake_case")]
6pub enum TokenKind {
7    Keyword,
8    Identifier,
9    Literal,
10    Operator,
11    Punctuation,
12    Comment,
13    BlockComment,
14    Whitespace,
15    Ignore,
16    Other,
17}
18
19impl TokenKind {
20    /// Return a stable byte discriminant for use in token hashing.
21    pub fn discriminant(&self) -> u8 {
22        match self {
23            Self::Keyword => 1,
24            Self::Identifier => 2,
25            Self::Literal => 3,
26            Self::Operator => 4,
27            Self::Punctuation => 5,
28            Self::Comment => 6,
29            Self::BlockComment => 7,
30            Self::Whitespace => 8,
31            Self::Ignore => 9,
32            Self::Other => 10,
33        }
34    }
35}
36
37#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
38pub struct Location {
39    pub line: u32,
40    pub column: u32,
41    pub offset: u32,
42}
43
44#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
45pub struct Token {
46    pub kind: TokenKind,
47    pub value: String,
48    pub start: Location,
49    pub end: Location,
50}
51
52#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
53pub struct BlameEntry {
54    pub commit_sha: String,
55    pub author: String,
56    pub timestamp: i64,
57}
58
59#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
60pub struct Fragment {
61    pub source_id: String,
62    pub start: Location,
63    pub end: Location,
64    pub range: [u32; 2],
65    pub blame: Option<BlameEntry>,
66}
67
68#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
69pub struct CpdClone {
70    pub format: String,
71    pub fragment_a: Fragment,
72    pub fragment_b: Fragment,
73    pub token_count: u32,
74}
75
76/// Internal detection unit — no heap allocation per token.
77///
78/// Produced by the tokenizer's detection path at tokenize time.
79/// `Token` is used for display, blame, and reporter output;
80/// `DetectionToken` is used only during the clone detection hot path.
81/// The token's value string is not stored — only its pre-computed hash.
82#[derive(Debug, Clone, PartialEq, Eq)]
83pub struct DetectionToken {
84    /// Pre-computed hash of (kind, value) — detection never re-hashes.
85    pub hash: u64,
86    pub start: Location,
87    pub end: Location,
88    /// Byte range in the source content: `[start_byte, end_byte]`.
89    pub range: [usize; 2],
90}
91
92/// A source file with pre-tokenized tokens, ready for clone detection.
93#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
94pub struct SourceFile {
95    pub id: String,
96    pub format: String,
97    pub tokens: Vec<Token>,
98}
99
100/// Per-format or total statistics row.
101#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
102#[serde(rename_all = "camelCase")]
103pub struct StatRow {
104    pub lines: u64,
105    pub tokens: u64,
106    pub sources: u64,
107    pub clones: u64,
108    pub duplicated_lines: u64,
109    pub duplicated_tokens: u64,
110    pub percentage: f64,
111    pub percentage_tokens: f64,
112    #[serde(default)]
113    pub new_duplicated_lines: u64,
114    #[serde(default)]
115    pub new_clones: u64,
116}
117
118/// Aggregated detection statistics.
119#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
120#[serde(rename_all = "camelCase")]
121pub struct Statistics {
122    pub total: StatRow,
123    pub formats: HashMap<String, StatRow>,
124    pub detection_date: String,
125}
126
#[cfg(test)]
mod tests {
    // `serde_json` is referenced by path below; a bare `use serde_json;` is
    // redundant (edition 2018+) and trips clippy's
    // `single_component_path_imports`, so it is intentionally absent.
    use super::*;

    /// Location shared by the fixtures: line 1, column 0, offset 0.
    fn origin() -> Location {
        Location {
            line: 1,
            column: 0,
            offset: 0,
        }
    }

    /// Builds a zeroed `Statistics` by hand (the types have no `Default` impl)
    /// and checks the value is readable back from the struct.
    #[test]
    fn statistics_default_total_is_zero() {
        let stats = Statistics {
            total: StatRow {
                lines: 0,
                tokens: 0,
                sources: 0,
                clones: 0,
                duplicated_lines: 0,
                duplicated_tokens: 0,
                percentage: 0.0,
                percentage_tokens: 0.0,
                new_duplicated_lines: 0,
                new_clones: 0,
            },
            formats: HashMap::new(),
            detection_date: "2026-01-01T00:00:00Z".to_string(),
        };
        assert_eq!(stats.total.clones, 0);
    }

    /// The two `#[serde(default)]` fields may be missing from the input and
    /// must then deserialize to zero; keys are camelCase.
    #[test]
    fn stat_row_missing_new_fields_default_to_zero() {
        let json = r#"{"lines":10,"tokens":50,"sources":1,"clones":0,"duplicatedLines":0,"duplicatedTokens":0,"percentage":0.0,"percentageTokens":0.0}"#;
        let row: StatRow = serde_json::from_str(json).unwrap();
        assert_eq!(row.lines, 10);
        assert_eq!(row.tokens, 50);
        assert_eq!(row.new_duplicated_lines, 0);
        assert_eq!(row.new_clones, 0);
    }

    /// A `Token` survives a JSON round trip unchanged.
    #[test]
    fn token_serializes_and_deserializes() {
        let token = Token {
            kind: TokenKind::Keyword,
            value: "function".to_string(),
            start: origin(),
            end: Location {
                line: 1,
                column: 8,
                offset: 8,
            },
        };
        let json = serde_json::to_string(&token).unwrap();
        let back: Token = serde_json::from_str(&json).unwrap();
        assert_eq!(token, back);
    }

    /// Blame data is emitted inside the clone's fragments, field names are not
    /// renamed, and the whole clone round-trips.
    #[test]
    fn cpd_clone_serializes_with_blame() {
        let blame = BlameEntry {
            commit_sha: "abc123".to_string(),
            author: "Alice".to_string(),
            timestamp: 1700000000,
        };
        let frag = Fragment {
            source_id: "a.js".to_string(),
            start: origin(),
            end: origin(),
            range: [0, 10],
            blame: Some(blame),
        };
        let clone = CpdClone {
            format: "javascript".to_string(),
            fragment_a: frag.clone(),
            fragment_b: frag,
            token_count: 50,
        };
        let json = serde_json::to_string(&clone).unwrap();
        assert!(json.contains("abc123"));
        assert!(json.contains("fragment_a"));

        let back: CpdClone = serde_json::from_str(&json).unwrap();
        assert_eq!(clone, back);
    }

    /// A missing blame is written as an explicit `null`, not omitted.
    #[test]
    fn fragment_blame_none_serializes_as_null() {
        let frag = Fragment {
            source_id: "b.js".to_string(),
            start: origin(),
            end: origin(),
            range: [0, 5],
            blame: None,
        };
        let json = serde_json::to_string(&frag).unwrap();
        assert!(json.contains("\"blame\":null"));
    }
}