provenant/copyright/types.rs
1//! Core types for copyright detection.
2//!
3//! This module defines:
4//! - Detection result types ([`CopyrightDetection`], [`HolderDetection`], [`AuthorDetection`])
//! - The POS tag enum ([`PosTag`]) with 52 variants for token classification
6//! - Parse tree types ([`ParseNode`], [`TreeLabel`]) for grammar-based extraction
7//! - The [`Token`] struct linking text values to POS tags and source locations
8
9use serde::Serialize;
10
11/// A detected copyright statement with source location.
12#[derive(Debug, Clone, PartialEq, Serialize)]
13pub struct CopyrightDetection {
14 /// The full copyright text (e.g., "Copyright 2024 Acme Inc.").
15 pub copyright: String,
16 /// 1-based line number where this detection starts.
17 pub start_line: usize,
18 /// 1-based line number where this detection ends.
19 pub end_line: usize,
20}
21
22/// A detected copyright holder name with source location.
23#[derive(Debug, Clone, PartialEq, Serialize)]
24pub struct HolderDetection {
25 /// The holder name (e.g., "Acme Inc.").
26 pub holder: String,
27 /// 1-based line number where this detection starts.
28 pub start_line: usize,
29 /// 1-based line number where this detection ends.
30 pub end_line: usize,
31}
32
33/// A detected author name with source location.
34#[derive(Debug, Clone, PartialEq, Serialize)]
35pub struct AuthorDetection {
36 /// The author name (e.g., "John Doe").
37 pub author: String,
38 /// 1-based line number where this detection starts.
39 pub start_line: usize,
40 /// 1-based line number where this detection ends.
41 pub end_line: usize,
42}
43
/// Part-of-speech tag assigned to a token.
///
/// Tags are modelled as an enum rather than as strings so that they are
/// checked by the compiler. The declaration order of the variants is part of
/// the type (it fixes the discriminant values), so new variants should not be
/// inserted casually in the middle of the list.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum PosTag {
    // ---- Copyright keywords ----
    /// Copyright marker: "Copyright", "(c)", "Copr.", etc.
    Copy,
    /// The "SPDX-FileContributor" keyword.
    SpdxContrib,

    // ---- Year-related ----
    /// A year such as "2024".
    Yr,
    /// A year followed by a plus sign: "2024+".
    YrPlus,
    /// A short year: "99".
    BareYr,

    // ---- Names and entities ----
    /// Proper noun: "John", "Smith".
    Nnp,
    /// Common noun (catch-all).
    Nn,
    /// All-caps word: "MIT", "IBM".
    Caps,
    /// Dotted name: "P.", "DMTF.".
    Pn,
    /// Mixed-case word: "LeGrande".
    MixedCap,

    // ---- Organization suffixes ----
    /// Company suffix: "Inc.", "Ltd.", "GmbH".
    Comp,
    /// University-like word: "University", "College".
    Uni,

    // ---- Author keywords ----
    /// "Author", "@author".
    Auth,
    /// "Written", "Developed", "Created".
    Auth2,
    /// "Authors", "author's".
    Auths,
    /// "Author.", "Authors.".
    AuthDot,
    /// "Maintainer", "Developer".
    Maint,
    /// "Contributors".
    Contributors,
    /// "Committers".
    Commit,

    // ---- Rights reserved ----
    /// "Rights", "Rechte", "Droits".
    Right,
    /// "Reserved", "Vorbehalten", "Réservés".
    Reserved,

    // ---- Conjunctions and prepositions ----
    /// "and", "&", ",".
    Cc,
    /// "of", "De", "Di".
    Of,
    /// "by".
    By,
    /// "in", "en".
    In,
    /// "van", "von", "de", "du".
    Van,
    /// "to".
    To,
    /// "-", "--", "/".
    Dash,

    // ---- Special ----
    /// Email address.
    Email,
    /// Email opening bracket like "<foo".
    EmailStart,
    /// Email closing bracket like "bar>".
    EmailEnd,
    /// URL with a scheme.
    Url,
    /// URL without a scheme (domain.com).
    Url2,
    /// "Holder", "Holders".
    Holder,
    /// "is", "are".
    Is,
    /// "held".
    Held,
    /// "NOTICE".
    Notice,
    /// "Portions", "Parts".
    Portions,
    /// "Others", "et al.".
    Oth,
    /// "following".
    Following,
    /// "MIT" (special handling).
    Mit,
    /// "Linux".
    Linux,
    /// "(" or ")".
    Parens,
    /// "AT" (obfuscated email).
    At,
    /// "DOT" (obfuscated email).
    Dot,
    /// "OU" (org unit in certs).
    Ou,

    // ---- Structural ----
    /// Empty line marker.
    EmptyLine,
    /// Junk to ignore.
    Junk,

    // ---- Cardinals ----
    /// Cardinal number.
    Cd,
    /// Small cardinal (0-39).
    Cds,
    /// Month abbreviation.
    Month,
    /// Day of week.
    Day,
}
119
120/// A token with its POS tag and source location.
121#[derive(Debug, Clone)]
122pub struct Token {
123 /// The token text (e.g., "Copyright", "2024", "Acme").
124 pub value: String,
125 /// The assigned POS tag.
126 pub tag: PosTag,
127 /// 1-based source line number.
128 pub start_line: usize,
129}
130
131/// A node in the parse tree
132#[derive(Debug, Clone)]
133pub enum ParseNode {
134 Leaf(Token),
135 Tree {
136 label: TreeLabel,
137 children: Vec<ParseNode>,
138 },
139}
140
/// Label attached to a parse tree node (a grammar non-terminal).
///
/// The grammar rules that produce each label are not defined in this module,
/// so the variants are listed here without further description.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum TreeLabel {
    YrRange,
    YrAnd,
    AllRightReserved,
    Name,
    NameEmail,
    NameYear,
    NameCopy,
    NameCaps,
    Company,
    AndCo,
    Copyright,
    Copyright2,
    Author,
    AndAuth,
    InitialDev,
    DashCaps,
}
161
162impl ParseNode {
163 /// Get the tag of this node (for leaf tokens) or None (for trees)
164 pub fn tag(&self) -> Option<PosTag> {
165 match self {
166 ParseNode::Leaf(token) => Some(token.tag),
167 ParseNode::Tree { .. } => None,
168 }
169 }
170
171 /// Get the label of this node (for trees) or None (for leaf tokens)
172 pub fn label(&self) -> Option<TreeLabel> {
173 match self {
174 ParseNode::Tree { label, .. } => Some(*label),
175 ParseNode::Leaf(_) => None,
176 }
177 }
178}
179
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a leaf node holding the year token "2024" on the given line.
    fn year_leaf(line: usize) -> ParseNode {
        ParseNode::Leaf(Token {
            value: String::from("2024"),
            tag: PosTag::Yr,
            start_line: line,
        })
    }

    /// A detection keeps the copyright text it was built with.
    #[test]
    fn test_copyright_detection_creation() {
        let text = "Copyright 2024 Acme Inc.";
        let detection = CopyrightDetection {
            copyright: String::from(text),
            start_line: 1,
            end_line: 1,
        };
        assert_eq!(detection.copyright, text);
    }

    /// A token keeps the tag it was built with.
    #[test]
    fn test_token_creation() {
        let token = Token {
            value: String::from("Copyright"),
            tag: PosTag::Copy,
            start_line: 1,
        };
        assert_eq!(token.tag, PosTag::Copy);
    }

    /// A leaf exposes its tag and has no label.
    #[test]
    fn test_parse_node_leaf() {
        let leaf = year_leaf(5);
        assert_eq!(leaf.tag(), Some(PosTag::Yr));
        assert_eq!(leaf.label(), None);
    }

    /// A tree exposes its label and has no tag.
    #[test]
    fn test_parse_node_tree() {
        let subtree = ParseNode::Tree {
            label: TreeLabel::YrRange,
            children: vec![year_leaf(3)],
        };
        assert_eq!(subtree.label(), Some(TreeLabel::YrRange));
        assert_eq!(subtree.tag(), None);
    }
}