Skip to main content

cpd_core/
models.rs

1use serde::{Deserialize, Serialize};
2use std::collections::HashMap;
3
/// Lexical category of a token.
///
/// Serialized through serde with snake_case variant names, so for example
/// `BlockComment` is written as `"block_comment"`. Each variant also has a
/// fixed one-byte code, returned by `TokenKind::discriminant`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum TokenKind {
    Keyword,
    Identifier,
    Literal,
    Operator,
    Punctuation,
    // NOTE(review): presumably `Comment` is a line comment and `BlockComment`
    // a delimited one — confirm against the tokenizer.
    Comment,
    BlockComment,
    Whitespace,
    // NOTE(review): how `Ignore` differs from `Other` is not visible in this
    // file — confirm before relying on either.
    Ignore,
    Other,
}
18
19impl TokenKind {
20    /// Return a stable byte discriminant for use in token hashing.
21    pub fn discriminant(&self) -> u8 {
22        match self {
23            Self::Keyword => 1,
24            Self::Identifier => 2,
25            Self::Literal => 3,
26            Self::Operator => 4,
27            Self::Punctuation => 5,
28            Self::Comment => 6,
29            Self::BlockComment => 7,
30            Self::Whitespace => 8,
31            Self::Ignore => 9,
32            Self::Other => 10,
33        }
34    }
35}
36
/// A position inside a source text.
///
/// NOTE(review): the tests in this file use `line: 1, column: 0, offset: 0`
/// for the first position, which suggests 1-based lines and 0-based
/// columns/offsets — confirm against the tokenizer.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct Location {
    /// Line number of the position.
    pub line: u32,
    /// Column of the position within its line.
    pub column: u32,
    /// Offset of the position from the start of the source.
    /// NOTE(review): the unit (bytes vs. characters) is not visible here — confirm.
    pub offset: u32,
}
43
/// A token that carries its own text.
///
/// According to the `DetectionToken` documentation in this file, `Token` is
/// the form used for display, blame, and reporter output, while detection
/// works on `DetectionToken` instead.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct Token {
    /// Lexical category of the token.
    pub kind: TokenKind,
    /// The token's text, stored as an owned string.
    pub value: String,
    /// Position where the token starts.
    pub start: Location,
    /// Position where the token ends.
    /// NOTE(review): whether this is inclusive or one-past-the-end is not
    /// visible here — confirm.
    pub end: Location,
}
51
/// Attribution record that can be attached to a `Fragment`.
///
/// NOTE(review): presumably filled from version-control blame data — confirm
/// with the code that constructs it.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct BlameEntry {
    /// Identifier of the commit.
    pub commit_sha: String,
    /// Author of the commit.
    pub author: String,
    /// Time of the commit.
    /// NOTE(review): the test fixture uses `1700000000`, which looks like Unix
    /// seconds — confirm the unit and epoch.
    pub timestamp: i64,
}
58
/// One side of a `CpdClone`: a span inside a single source.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct Fragment {
    /// Identifier of the source that contains the span (the tests in this file
    /// use file names such as `"a.js"`).
    pub source_id: String,
    /// Position where the span starts.
    pub start: Location,
    /// Position where the span ends.
    pub end: Location,
    /// Two-element range for the span.
    /// NOTE(review): presumably `[start, end]` offsets like
    /// `DetectionToken::range`, but stored as `u32` rather than `usize` — confirm.
    pub range: [u32; 2],
    /// Optional attribution for the span; `None` serializes as JSON `null`.
    pub blame: Option<BlameEntry>,
}
67
/// A detected clone: a pair of fragments together with a token count.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct CpdClone {
    /// Format label of the clone (the tests in this file use `"javascript"`).
    pub format: String,
    /// First fragment of the pair.
    pub fragment_a: Fragment,
    /// Second fragment of the pair.
    pub fragment_b: Fragment,
    /// Number of tokens associated with the clone.
    /// NOTE(review): presumably the length of the duplicated token run — confirm.
    pub token_count: u32,
}
75
/// Internal detection unit — no heap allocation per token.
///
/// Produced by the tokenizer's detection path at tokenize time.
/// `Token` is used for display, blame, and reporter output;
/// `DetectionToken` is used only during the clone detection hot path.
/// The token's value string is not stored — only its pre-computed hash.
///
/// Unlike the other model types in this file, this struct does not derive
/// `Serialize`/`Deserialize`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct DetectionToken {
    /// Pre-computed hash of (kind, value) — detection never re-hashes.
    pub hash: u64,
    /// Position where the token starts.
    pub start: Location,
    /// Position where the token ends.
    pub end: Location,
    /// Byte range in the source content: `[start_byte, end_byte]`.
    pub range: [usize; 2],
}
91
/// A source file with pre-tokenized tokens, ready for clone detection.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct SourceFile {
    /// Identifier of the source.
    /// NOTE(review): presumably the value that ends up in `Fragment::source_id` — confirm.
    pub id: String,
    /// Format label of the source.
    pub format: String,
    /// The file's tokens, stored as full `Token` values (with text).
    pub tokens: Vec<Token>,
}
99
/// Per-format or total statistics row.
///
/// Derives `PartialEq` but not `Eq`, because the two percentage fields are `f64`.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct StatRow {
    /// Count of lines.
    pub lines: u64,
    /// Count of tokens.
    pub tokens: u64,
    /// Count of sources.
    pub sources: u64,
    /// Count of clones.
    pub clones: u64,
    /// Count of duplicated lines.
    pub duplicated_lines: u64,
    /// Count of duplicated tokens.
    pub duplicated_tokens: u64,
    /// Line-based duplication percentage.
    /// NOTE(review): presumably `duplicated_lines` relative to `lines`; the scale
    /// (0–100 vs. 0–1) is not visible here — confirm.
    pub percentage: f64,
    /// Token-based duplication percentage.
    /// NOTE(review): presumably `duplicated_tokens` relative to `tokens`, same
    /// scale as `percentage` — confirm.
    pub percentage_tokens: f64,
}
112
/// Aggregated detection statistics.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct Statistics {
    /// The total row.
    pub total: StatRow,
    /// One row per format, keyed by a string.
    /// NOTE(review): presumably the same format labels used in
    /// `SourceFile::format` — confirm.
    pub formats: HashMap<String, StatRow>,
    /// Date of the detection run, kept as a string.
    /// NOTE(review): the test fixture uses `"2026-01-01T00:00:00Z"`, which looks
    /// like an RFC 3339 UTC timestamp — confirm the expected format.
    pub detection_date: String,
}
120
#[cfg(test)]
mod tests {
    // Smoke tests for the model types: construction and JSON (serde) behavior.
    // `serde_json` is referenced by path; no `use serde_json;` is required
    // because the crate name is already in scope.
    use super::*;

    /// A hand-built, all-zero `Statistics` value reports zero clones in its total row.
    #[test]
    fn statistics_default_total_is_zero() {
        let stats = Statistics {
            total: StatRow {
                lines: 0,
                tokens: 0,
                sources: 0,
                clones: 0,
                duplicated_lines: 0,
                duplicated_tokens: 0,
                percentage: 0.0,
                percentage_tokens: 0.0,
            },
            formats: HashMap::new(),
            detection_date: "2026-01-01T00:00:00Z".to_string(),
        };
        assert_eq!(stats.total.clones, 0);
    }

    /// A `Token` survives a JSON round trip unchanged.
    #[test]
    fn token_serializes_and_deserializes() {
        let token = Token {
            kind: TokenKind::Keyword,
            value: "function".to_string(),
            start: Location {
                line: 1,
                column: 0,
                offset: 0,
            },
            end: Location {
                line: 1,
                column: 8,
                offset: 8,
            },
        };
        let json = serde_json::to_string(&token).unwrap();
        let back: Token = serde_json::from_str(&json).unwrap();
        assert_eq!(token, back);
    }

    /// Blame data and the fragment field names appear in a serialized `CpdClone`.
    #[test]
    fn cpd_clone_serializes_with_blame() {
        let loc = Location {
            line: 1,
            column: 0,
            offset: 0,
        };
        let blame = BlameEntry {
            commit_sha: "abc123".to_string(),
            author: "Alice".to_string(),
            timestamp: 1700000000,
        };
        let frag = Fragment {
            source_id: "a.js".to_string(),
            start: loc.clone(),
            // Last use of `loc`: move it instead of cloning again.
            end: loc,
            range: [0, 10],
            blame: Some(blame),
        };
        let clone = CpdClone {
            format: "javascript".to_string(),
            fragment_a: frag.clone(),
            fragment_b: frag,
            token_count: 50,
        };
        let json = serde_json::to_string(&clone).unwrap();
        assert!(json.contains("abc123"));
        assert!(json.contains("fragment_a"));
    }

    /// A fragment without blame serializes the field as an explicit JSON `null`.
    #[test]
    fn fragment_blame_none_serializes_as_null() {
        let loc = Location {
            line: 1,
            column: 0,
            offset: 0,
        };
        let frag = Fragment {
            source_id: "b.js".to_string(),
            start: loc.clone(),
            // Last use of `loc`: move it instead of cloning again.
            end: loc,
            range: [0, 5],
            blame: None,
        };
        let json = serde_json::to_string(&frag).unwrap();
        assert!(json.contains("\"blame\":null"));
    }
}
209        };
210        let json = serde_json::to_string(&frag).unwrap();
211        assert!(json.contains("\"blame\":null"));
212    }
213}