Skip to main content

provenant/copyright/
types.rs

//! Core types for copyright detection.
//!
//! This module defines:
//! - Detection result types ([`CopyrightDetection`], [`HolderDetection`], [`AuthorDetection`])
//! - The POS tag enum ([`PosTag`]) with 52 variants for token classification
//! - Parse tree types ([`ParseNode`], [`TreeLabel`]) for grammar-based extraction
//! - The [`Token`] struct linking text values to POS tags and source locations
8
9use crate::models::LineNumber;
10
11#[derive(Debug, Clone, PartialEq)]
12pub struct CopyrightDetection {
13    pub copyright: String,
14    pub start_line: LineNumber,
15    pub end_line: LineNumber,
16}
17
18#[derive(Debug, Clone, PartialEq)]
19pub struct HolderDetection {
20    pub holder: String,
21    pub start_line: LineNumber,
22    pub end_line: LineNumber,
23}
24
25#[derive(Debug, Clone, PartialEq)]
26pub struct AuthorDetection {
27    pub author: String,
28    pub start_line: LineNumber,
29    pub end_line: LineNumber,
30}
31
/// Part-of-speech tag assigned to a [`Token`].
///
/// Tags are a closed enum rather than free-form strings, so a misspelled tag
/// is a compile error instead of a silent mismatch. The examples on each
/// variant show typical token text for that tag.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub enum PosTag {
    // ---- Copyright keywords ----
    /// `Copyright`, `(c)`, `Copr.`, etc.
    Copy,
    /// `SPDX-FileContributor`
    SpdxContrib,

    // ---- Year-related ----
    /// A year like `2024`.
    Yr,
    /// Year with plus: `2024+`.
    YrPlus,
    /// Short year: `99`.
    BareYr,

    // ---- Names and entities ----
    /// Proper noun: `John`, `Smith`.
    Nnp,
    /// Common noun (catch-all).
    Nn,
    /// All-caps word: `MIT`, `IBM`.
    Caps,
    /// Dotted name: `P.`, `DMTF.`.
    Pn,
    /// Mixed case: `LeGrande`.
    MixedCap,

    // ---- Organization suffixes ----
    /// Company suffix: `Inc.`, `Ltd.`, `GmbH`.
    Comp,
    /// University: `University`, `College`.
    Uni,

    // ---- Author keywords ----
    /// `Author`, `@author`.
    Auth,
    /// `Written`, `Developed`, `Created`.
    Auth2,
    /// `Authors`, `author's`.
    Auths,
    /// `Author.`, `Authors.`.
    AuthDot,
    /// `Maintainer`, `Developer`.
    Maint,
    /// `Contributors`.
    Contributors,
    /// `Committers`.
    Commit,

    // ---- Rights reserved ----
    /// `Rights`, `Rechte`, `Droits`.
    Right,
    /// `Reserved`, `Vorbehalten`, `Réservés`.
    Reserved,

    // ---- Conjunctions and prepositions ----
    /// `and`, `&`, `,`.
    Cc,
    /// `of`, `De`, `Di`.
    Of,
    /// `by`.
    By,
    /// `in`, `en`.
    In,
    /// `van`, `von`, `de`, `du`.
    Van,
    /// `to`.
    To,
    /// `-`, `--`, `/`.
    Dash,

    // ---- Special ----
    /// Email address.
    Email,
    /// Email opening bracket like `<foo`.
    EmailStart,
    /// Email closing bracket like `bar>`.
    EmailEnd,
    /// URL with scheme.
    Url,
    /// URL without scheme (`domain.com`).
    Url2,
    /// `Holder`, `Holders`.
    Holder,
    /// `is`, `are`.
    Is,
    /// `held`.
    Held,
    /// `NOTICE`.
    Notice,
    /// `Portions`, `Parts`.
    Portions,
    /// `Others`, `et al.`.
    Oth,
    /// `following`.
    Following,
    /// `MIT` (special handling).
    Mit,
    /// `Linux`.
    Linux,
    /// `(` or `)`.
    Parens,
    /// `AT` (obfuscated email).
    At,
    /// `DOT` (obfuscated email).
    Dot,
    /// `OU` (org unit in certs).
    Ou,

    // ---- Structural ----
    /// Empty line marker.
    EmptyLine,
    /// Junk to ignore.
    Junk,

    // ---- Cardinals ----
    /// Cardinal number.
    Cd,
    /// Small cardinal (0-39).
    Cds,
    /// Month abbreviation.
    Month,
    /// Day of week.
    Day,
}
107
108#[derive(Debug, Clone)]
109pub struct Token {
110    pub value: String,
111    pub tag: PosTag,
112    pub start_line: LineNumber,
113}
114
115/// A node in the parse tree
116#[derive(Debug, Clone)]
117pub enum ParseNode {
118    Leaf(Token),
119    Tree {
120        label: TreeLabel,
121        children: Vec<ParseNode>,
122    },
123}
124
/// Label carried by a [`ParseNode::Tree`] node.
///
/// Each variant names one grammar non-terminal. The grammar rules that
/// produce these labels are defined outside this module.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub enum TreeLabel {
    // Year groupings
    YrRange,
    YrAnd,

    // Rights statement
    AllRightReserved,

    // Name groupings
    Name,
    NameEmail,
    NameYear,
    NameCopy,
    NameCaps,

    // Company groupings
    Company,
    AndCo,

    // Copyright statements
    Copyright,
    Copyright2,

    // Author groupings
    Author,
    AndAuth,

    // Other
    InitialDev,
    DashCaps,
}
145
146impl ParseNode {
147    /// Get the tag of this node (for leaf tokens) or None (for trees)
148    pub fn tag(&self) -> Option<PosTag> {
149        match self {
150            ParseNode::Leaf(token) => Some(token.tag),
151            ParseNode::Tree { .. } => None,
152        }
153    }
154
155    /// Get the label of this node (for trees) or None (for leaf tokens)
156    pub fn label(&self) -> Option<TreeLabel> {
157        match self {
158            ParseNode::Tree { label, .. } => Some(*label),
159            ParseNode::Leaf(_) => None,
160        }
161    }
162}
163
#[cfg(test)]
mod tests {
    use super::*;

    /// A detection keeps the exact statement text it was built with.
    #[test]
    fn test_copyright_detection_creation() {
        let statement = "Copyright 2024 Acme Inc.";
        let detection = CopyrightDetection {
            copyright: String::from(statement),
            start_line: LineNumber::ONE,
            end_line: LineNumber::ONE,
        };
        assert_eq!(detection.copyright, statement);
    }

    /// A token keeps the tag it was built with.
    #[test]
    fn test_token_creation() {
        let token = Token {
            value: String::from("Copyright"),
            tag: PosTag::Copy,
            start_line: LineNumber::ONE,
        };
        assert_eq!(token.tag, PosTag::Copy);
    }

    /// A leaf exposes its token's tag and has no label.
    #[test]
    fn test_parse_node_leaf() {
        let year = Token {
            value: String::from("2024"),
            tag: PosTag::Yr,
            start_line: LineNumber::new(5).unwrap(),
        };
        let leaf = ParseNode::Leaf(year);
        assert_eq!(leaf.tag(), Some(PosTag::Yr));
        assert!(leaf.label().is_none());
    }

    /// A subtree exposes its label and has no tag.
    #[test]
    fn test_parse_node_tree() {
        let year = Token {
            value: String::from("2024"),
            tag: PosTag::Yr,
            start_line: LineNumber::new(3).unwrap(),
        };
        let subtree = ParseNode::Tree {
            label: TreeLabel::YrRange,
            children: vec![ParseNode::Leaf(year)],
        };
        assert_eq!(subtree.label(), Some(TreeLabel::YrRange));
        assert!(subtree.tag().is_none());
    }
}