provenant/copyright/types.rs
1// SPDX-FileCopyrightText: Provenant contributors
2// SPDX-License-Identifier: Apache-2.0
3
//! Core types for copyright detection.
//!
//! This module defines:
//! - Detection result types ([`CopyrightDetection`], [`HolderDetection`], [`AuthorDetection`])
//! - The POS tag enum ([`PosTag`]) with 52 variants for token classification
//! - Parse tree types ([`ParseNode`], [`TreeLabel`]) for grammar-based extraction
//! - The [`Token`] struct linking text values to POS tags and source locations
11
12use crate::models::LineNumber;
13
14#[derive(Debug, Clone, PartialEq)]
15pub struct CopyrightDetection {
16 pub copyright: String,
17 pub start_line: LineNumber,
18 pub end_line: LineNumber,
19}
20
21#[derive(Debug, Clone, PartialEq)]
22pub struct HolderDetection {
23 pub holder: String,
24 pub start_line: LineNumber,
25 pub end_line: LineNumber,
26}
27
28#[derive(Debug, Clone, PartialEq)]
29pub struct AuthorDetection {
30 pub author: String,
31 pub start_line: LineNumber,
32 pub end_line: LineNumber,
33}
34
/// Part-of-speech tag attached to a token.
///
/// Tags are modelled as an enum rather than free-form strings so that tag
/// handling stays type-safe. Variant order is significant for the implicit
/// discriminants and must not be rearranged casually.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub enum PosTag {
    // ---- Copyright keywords ----
    /// "Copyright", "(c)", "Copr.", etc.
    Copy,
    /// "SPDX-FileContributor"
    SpdxContrib,

    // ---- Year-related ----
    /// A year like "2024"
    Yr,
    /// Year with plus: "2024+"
    YrPlus,
    /// Short year: "99"
    BareYr,

    // ---- Names and entities ----
    /// Proper noun: "John", "Smith"
    Nnp,
    /// Common noun (catch-all)
    Nn,
    /// All-caps word: "MIT", "IBM"
    Caps,
    /// Dotted name: "P.", "DMTF."
    Pn,
    /// Mixed case: "LeGrande"
    MixedCap,

    // ---- Organization suffixes ----
    /// Company suffix: "Inc.", "Ltd.", "GmbH"
    Comp,
    /// University: "University", "College"
    Uni,

    // ---- Author keywords ----
    /// "Author", "@author"
    Auth,
    /// "Written", "Developed", "Created"
    Auth2,
    /// "Authors", "author's"
    Auths,
    /// "Author.", "Authors."
    AuthDot,
    /// "Maintainer", "Developer"
    Maint,
    /// "Contributors"
    Contributors,
    /// "Committers"
    Commit,

    // ---- Rights reserved ----
    /// "Rights", "Rechte", "Droits"
    Right,
    /// "Reserved", "Vorbehalten", "Réservés"
    Reserved,

    // ---- Conjunctions and prepositions ----
    /// "and", "&", ","
    Cc,
    /// "of", "De", "Di"
    Of,
    /// "by"
    By,
    /// "in", "en"
    In,
    /// "van", "von", "de", "du"
    Van,
    /// "to"
    To,
    /// "-", "--", "/"
    Dash,

    // ---- Special ----
    /// Email address
    Email,
    /// Email opening bracket like `<foo`
    EmailStart,
    /// Email closing bracket like `bar>`
    EmailEnd,
    /// URL with scheme
    Url,
    /// URL without scheme (domain.com)
    Url2,
    /// "Holder", "Holders"
    Holder,
    /// "is", "are"
    Is,
    /// "held"
    Held,
    /// "NOTICE"
    Notice,
    /// "Portions", "Parts"
    Portions,
    /// "Others", "et al."
    Oth,
    /// "following"
    Following,
    /// "MIT" (special handling)
    Mit,
    /// "Linux"
    Linux,
    /// "(" or ")"
    Parens,
    /// "AT" (obfuscated email)
    At,
    /// "DOT" (obfuscated email)
    Dot,
    /// "OU" (org unit in certs)
    Ou,

    // ---- Structural ----
    /// Empty line marker
    EmptyLine,
    /// Junk to ignore
    Junk,

    // ---- Cardinals ----
    /// Cardinal number
    Cd,
    /// Small cardinal (0-39)
    Cds,
    /// Month abbreviation
    Month,
    /// Day of week
    Day,
}
110
111#[derive(Debug, Clone)]
112pub struct Token {
113 pub value: String,
114 pub tag: PosTag,
115 pub start_line: LineNumber,
116}
117
118/// A node in the parse tree
119#[derive(Debug, Clone)]
120pub enum ParseNode {
121 Leaf(Token),
122 Tree {
123 label: TreeLabel,
124 children: Vec<ParseNode>,
125 },
126}
127
/// Grammar non-terminals used as labels on parse tree nodes.
///
/// Variant order determines the implicit discriminants; append new labels
/// rather than reordering existing ones.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub enum TreeLabel {
    YrRange,
    YrAnd,
    AllRightReserved,
    Name,
    NameEmail,
    NameYear,
    NameCopy,
    NameCaps,
    Company,
    AndCo,
    Copyright,
    Copyright2,
    Author,
    AndAuth,
    InitialDev,
    DashCaps,
}
148
149impl ParseNode {
150 /// Get the tag of this node (for leaf tokens) or None (for trees)
151 pub fn tag(&self) -> Option<PosTag> {
152 match self {
153 ParseNode::Leaf(token) => Some(token.tag),
154 ParseNode::Tree { .. } => None,
155 }
156 }
157
158 /// Get the label of this node (for trees) or None (for leaf tokens)
159 pub fn label(&self) -> Option<TreeLabel> {
160 match self {
161 ParseNode::Tree { label, .. } => Some(*label),
162 ParseNode::Leaf(_) => None,
163 }
164 }
165}
166
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a leaf node holding the year token "2024" located at `line`.
    fn year_leaf(line: LineNumber) -> ParseNode {
        ParseNode::Leaf(Token {
            value: String::from("2024"),
            tag: PosTag::Yr,
            start_line: line,
        })
    }

    #[test]
    fn test_copyright_detection_creation() {
        let text = "Copyright 2024 Acme Inc.";
        let detection = CopyrightDetection {
            copyright: text.to_owned(),
            start_line: LineNumber::ONE,
            end_line: LineNumber::ONE,
        };
        assert_eq!(detection.copyright, text);
    }

    #[test]
    fn test_token_creation() {
        let token = Token {
            value: String::from("Copyright"),
            tag: PosTag::Copy,
            start_line: LineNumber::ONE,
        };
        assert_eq!(token.tag, PosTag::Copy);
    }

    #[test]
    fn test_parse_node_leaf() {
        // A leaf exposes its token's tag and has no tree label.
        let leaf = year_leaf(LineNumber::new(5).unwrap());
        assert_eq!(leaf.tag(), Some(PosTag::Yr));
        assert_eq!(leaf.label(), None);
    }

    #[test]
    fn test_parse_node_tree() {
        // A tree exposes its label and has no tag, whatever its children are.
        let tree = ParseNode::Tree {
            label: TreeLabel::YrRange,
            children: vec![year_leaf(LineNumber::new(3).unwrap())],
        };
        assert_eq!(tree.label(), Some(TreeLabel::YrRange));
        assert_eq!(tree.tag(), None);
    }
}