provenant/copyright/types.rs
1//! Core types for copyright detection.
2//!
3//! This module defines:
4//! - Detection result types ([`CopyrightDetection`], [`HolderDetection`], [`AuthorDetection`])
5//! - The POS tag enum ([`PosTag`]) with 52 variants for token classification
6//! - Parse tree types ([`ParseNode`], [`TreeLabel`]) for grammar-based extraction
7//! - The [`Token`] struct linking text values to POS tags and source locations
8
9use crate::models::LineNumber;
10
11#[derive(Debug, Clone, PartialEq)]
12pub struct CopyrightDetection {
13 pub copyright: String,
14 pub start_line: LineNumber,
15 pub end_line: LineNumber,
16}
17
18#[derive(Debug, Clone, PartialEq)]
19pub struct HolderDetection {
20 pub holder: String,
21 pub start_line: LineNumber,
22 pub end_line: LineNumber,
23}
24
25#[derive(Debug, Clone, PartialEq)]
26pub struct AuthorDetection {
27 pub author: String,
28 pub start_line: LineNumber,
29 pub end_line: LineNumber,
30}
31
/// Part-of-speech tag assigned to a token.
///
/// Modelled as an enum rather than a string so tags are type-checked.
/// Variant order is kept stable; do not reorder without checking for
/// code that relies on discriminant values.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub enum PosTag {
    // ---- Copyright keywords ----
    /// "Copyright", "(c)", "Copr.", etc.
    Copy,
    /// "SPDX-FileContributor"
    SpdxContrib,

    // ---- Year-related ----
    /// A year like "2024"
    Yr,
    /// Year with plus: "2024+"
    YrPlus,
    /// Short year: "99"
    BareYr,

    // ---- Names and entities ----
    /// Proper noun: "John", "Smith"
    Nnp,
    /// Common noun (catch-all)
    Nn,
    /// All-caps word: "MIT", "IBM"
    Caps,
    /// Dotted name: "P.", "DMTF."
    Pn,
    /// Mixed case: "LeGrande"
    MixedCap,

    // ---- Organization suffixes ----
    /// Company suffix: "Inc.", "Ltd.", "GmbH"
    Comp,
    /// University: "University", "College"
    Uni,

    // ---- Author keywords ----
    /// "Author", "@author"
    Auth,
    /// "Written", "Developed", "Created"
    Auth2,
    /// "Authors", "author's"
    Auths,
    /// "Author.", "Authors."
    AuthDot,
    /// "Maintainer", "Developer"
    Maint,
    /// "Contributors"
    Contributors,
    /// "Committers"
    Commit,

    // ---- Rights reserved ----
    /// "Rights", "Rechte", "Droits"
    Right,
    /// "Reserved", "Vorbehalten", "Réservés"
    Reserved,

    // ---- Conjunctions and prepositions ----
    /// "and", "&", ","
    Cc,
    /// "of", "De", "Di"
    Of,
    /// "by"
    By,
    /// "in", "en"
    In,
    /// "van", "von", "de", "du"
    Van,
    /// "to"
    To,
    /// "-", "--", "/"
    Dash,

    // ---- Special ----
    /// Email address
    Email,
    /// Email opening bracket like "<foo"
    EmailStart,
    /// Email closing bracket like "bar>"
    EmailEnd,
    /// URL with scheme
    Url,
    /// URL without scheme (domain.com)
    Url2,
    /// "Holder", "Holders"
    Holder,
    /// "is", "are"
    Is,
    /// "held"
    Held,
    /// "NOTICE"
    Notice,
    /// "Portions", "Parts"
    Portions,
    /// "Others", "et al."
    Oth,
    /// "following"
    Following,
    /// "MIT" (special handling)
    Mit,
    /// "Linux"
    Linux,
    /// "(" or ")"
    Parens,
    /// "AT" (obfuscated email)
    At,
    /// "DOT" (obfuscated email)
    Dot,
    /// "OU" (org unit in certs)
    Ou,

    // ---- Structural ----
    /// Empty line marker
    EmptyLine,
    /// Junk to ignore
    Junk,

    // ---- Cardinals ----
    /// Cardinal number
    Cd,
    /// Small cardinal (0-39)
    Cds,
    /// Month abbreviation
    Month,
    /// Day of week
    Day,
}
107
108#[derive(Debug, Clone)]
109pub struct Token {
110 pub value: String,
111 pub tag: PosTag,
112 pub start_line: LineNumber,
113}
114
115/// A node in the parse tree
116#[derive(Debug, Clone)]
117pub enum ParseNode {
118 Leaf(Token),
119 Tree {
120 label: TreeLabel,
121 children: Vec<ParseNode>,
122 },
123}
124
/// Grammar non-terminals used to label [`ParseNode::Tree`] nodes.
///
/// NOTE(review): the grouping comments below are inferred from the variant
/// names only; confirm against the grammar rules before relying on them.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub enum TreeLabel {
    // Year groupings (presumably)
    YrRange,
    YrAnd,

    // Rights-reserved phrase (presumably)
    AllRightReserved,

    // Name groupings (presumably)
    Name,
    NameEmail,
    NameYear,
    NameCopy,
    NameCaps,

    // Company groupings (presumably)
    Company,
    AndCo,

    // Copyright statements (presumably)
    Copyright,
    Copyright2,

    // Author groupings (presumably)
    Author,
    AndAuth,

    // Remaining labels
    InitialDev,
    DashCaps,
}
145
146impl ParseNode {
147 /// Get the tag of this node (for leaf tokens) or None (for trees)
148 pub fn tag(&self) -> Option<PosTag> {
149 match self {
150 ParseNode::Leaf(token) => Some(token.tag),
151 ParseNode::Tree { .. } => None,
152 }
153 }
154
155 /// Get the label of this node (for trees) or None (for leaf tokens)
156 pub fn label(&self) -> Option<TreeLabel> {
157 match self {
158 ParseNode::Tree { label, .. } => Some(*label),
159 ParseNode::Leaf(_) => None,
160 }
161 }
162}
163
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a leaf node holding the year token "2024" on the given line.
    fn year_leaf(start_line: LineNumber) -> ParseNode {
        ParseNode::Leaf(Token {
            value: "2024".to_string(),
            tag: PosTag::Yr,
            start_line,
        })
    }

    /// A detection keeps the copyright text it was built with.
    #[test]
    fn test_copyright_detection_creation() {
        let detection = CopyrightDetection {
            copyright: "Copyright 2024 Acme Inc.".to_string(),
            start_line: LineNumber::ONE,
            end_line: LineNumber::ONE,
        };

        assert_eq!(detection.copyright, "Copyright 2024 Acme Inc.");
    }

    /// A token keeps the tag it was built with.
    #[test]
    fn test_token_creation() {
        let token = Token {
            value: "Copyright".to_string(),
            tag: PosTag::Copy,
            start_line: LineNumber::ONE,
        };

        assert_eq!(token.tag, PosTag::Copy);
    }

    /// Leaf nodes expose their token's tag and have no label.
    #[test]
    fn test_parse_node_leaf() {
        let leaf = year_leaf(LineNumber::new(5).unwrap());

        assert_eq!(leaf.tag(), Some(PosTag::Yr));
        assert_eq!(leaf.label(), None);
    }

    /// Tree nodes expose their label and have no tag.
    #[test]
    fn test_parse_node_tree() {
        let subtree = ParseNode::Tree {
            label: TreeLabel::YrRange,
            children: vec![year_leaf(LineNumber::new(3).unwrap())],
        };

        assert_eq!(subtree.label(), Some(TreeLabel::YrRange));
        assert_eq!(subtree.tag(), None);
    }
}
213}