// File: provenant/copyright/types.rs

// SPDX-FileCopyrightText: Provenant contributors
// SPDX-License-Identifier: Apache-2.0
3
//! Core types for copyright detection.
//!
//! This module defines:
//! - Detection result types ([`CopyrightDetection`], [`HolderDetection`], [`AuthorDetection`])
//! - The POS tag enum ([`PosTag`]) with 52 variants for token classification
//! - Parse tree types ([`ParseNode`], [`TreeLabel`]) for grammar-based extraction
//! - The [`Token`] struct linking text values to POS tags and source locations
11
12use crate::models::LineNumber;
13
14#[derive(Debug, Clone, PartialEq)]
15pub struct CopyrightDetection {
16    pub copyright: String,
17    pub start_line: LineNumber,
18    pub end_line: LineNumber,
19}
20
21#[derive(Debug, Clone, PartialEq)]
22pub struct HolderDetection {
23    pub holder: String,
24    pub start_line: LineNumber,
25    pub end_line: LineNumber,
26}
27
28#[derive(Debug, Clone, PartialEq)]
29pub struct AuthorDetection {
30    pub author: String,
31    pub start_line: LineNumber,
32    pub end_line: LineNumber,
33}
34
/// Part-of-speech tag for a token (type-safe, not stringly-typed).
///
/// Quoted strings in the variant docs are example token texts for that tag.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum PosTag {
    // Copyright keywords
    /// "Copyright", "(c)", "Copr.", etc.
    Copy,
    /// "SPDX-FileContributor"
    SpdxContrib,

    // Year-related
    /// A year like "2024"
    Yr,
    /// Year with plus: "2024+"
    YrPlus,
    /// Short year: "99"
    BareYr,

    // Names and entities
    /// Proper noun: "John", "Smith"
    Nnp,
    /// Common noun (catch-all)
    Nn,
    /// All-caps word: "MIT", "IBM"
    Caps,
    /// Dotted name: "P.", "DMTF."
    Pn,
    /// Mixed case: "LeGrande"
    MixedCap,

    // Organization suffixes
    /// Company suffix: "Inc.", "Ltd.", "GmbH"
    Comp,
    /// University: "University", "College"
    Uni,

    // Author keywords
    /// "Author", "@author"
    Auth,
    /// "Written", "Developed", "Created"
    Auth2,
    /// "Authors", "author's"
    Auths,
    /// "Author.", "Authors."
    AuthDot,
    /// "Maintainer", "Developer"
    Maint,
    /// "Contributors"
    Contributors,
    /// "Committers"
    Commit,

    // Rights reserved
    /// "Rights", "Rechte", "Droits"
    Right,
    /// "Reserved", "Vorbehalten", "Réservés"
    Reserved,

    // Conjunctions and prepositions
    /// "and", "&", ","
    Cc,
    /// "of", "De", "Di"
    Of,
    /// "by"
    By,
    /// "in", "en"
    In,
    /// "van", "von", "de", "du"
    Van,
    /// "to"
    To,
    /// "-", "--", "/"
    Dash,

    // Special
    /// Email address
    Email,
    /// Email opening bracket like `<foo`
    EmailStart,
    /// Email closing bracket like `bar>`
    EmailEnd,
    /// URL with scheme
    Url,
    /// URL without scheme (domain.com)
    Url2,
    /// "Holder", "Holders"
    Holder,
    /// "is", "are"
    Is,
    /// "held"
    Held,
    /// "NOTICE"
    Notice,
    /// "Portions", "Parts"
    Portions,
    /// "Others", "et al."
    Oth,
    /// "following"
    Following,
    /// "MIT" (special handling)
    Mit,
    /// "Linux"
    Linux,
    /// "(" or ")"
    Parens,
    /// "AT" (obfuscated email)
    At,
    /// "DOT" (obfuscated email)
    Dot,
    /// "OU" (org unit in certs)
    Ou,

    // Structural
    /// Empty line marker
    EmptyLine,
    /// Junk to ignore
    Junk,

    // Cardinals
    /// Cardinal number
    Cd,
    /// Small cardinal (0-39)
    Cds,
    /// Month abbreviation
    Month,
    /// Day of week
    Day,
}
110
111#[derive(Debug, Clone)]
112pub struct Token {
113    pub value: String,
114    pub tag: PosTag,
115    pub start_line: LineNumber,
116}
117
118/// A node in the parse tree
119#[derive(Debug, Clone)]
120pub enum ParseNode {
121    Leaf(Token),
122    Tree {
123        label: TreeLabel,
124        children: Vec<ParseNode>,
125    },
126}
127
/// Labels for parse tree nodes (grammar non-terminals).
///
/// A label is stored in the `label` field of [`ParseNode::Tree`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum TreeLabel {
    YrRange,
    YrAnd,
    AllRightReserved,
    Name,
    NameEmail,
    NameYear,
    NameCopy,
    NameCaps,
    Company,
    AndCo,
    Copyright,
    Copyright2,
    Author,
    AndAuth,
    InitialDev,
    DashCaps,
}
148
149impl ParseNode {
150    /// Get the tag of this node (for leaf tokens) or None (for trees)
151    pub fn tag(&self) -> Option<PosTag> {
152        match self {
153            ParseNode::Leaf(token) => Some(token.tag),
154            ParseNode::Tree { .. } => None,
155        }
156    }
157
158    /// Get the label of this node (for trees) or None (for leaf tokens)
159    pub fn label(&self) -> Option<TreeLabel> {
160        match self {
161            ParseNode::Tree { label, .. } => Some(*label),
162            ParseNode::Leaf(_) => None,
163        }
164    }
165}
166
#[cfg(test)]
mod tests {
    use super::*;

    /// A detection keeps the text and line span it was built with.
    #[test]
    fn test_copyright_detection_creation() {
        let d = CopyrightDetection {
            copyright: "Copyright 2024 Acme Inc.".to_string(),
            start_line: LineNumber::ONE,
            end_line: LineNumber::ONE,
        };
        assert_eq!(d.copyright, "Copyright 2024 Acme Inc.");
        assert_eq!(d.start_line, LineNumber::ONE);
        assert_eq!(d.end_line, LineNumber::ONE);
    }

    /// A token keeps the value and tag it was built with.
    #[test]
    fn test_token_creation() {
        let t = Token {
            value: "Copyright".to_string(),
            tag: PosTag::Copy,
            start_line: LineNumber::ONE,
        };
        assert_eq!(t.value, "Copyright");
        assert_eq!(t.tag, PosTag::Copy);
    }

    /// A leaf exposes its token's tag and has no label.
    #[test]
    fn test_parse_node_leaf() {
        let node = ParseNode::Leaf(Token {
            value: "2024".to_string(),
            tag: PosTag::Yr,
            start_line: LineNumber::new(5).expect("5 is a valid line number"),
        });
        assert_eq!(node.tag(), Some(PosTag::Yr));
        assert_eq!(node.label(), None);
    }

    /// A tree exposes its label and has no tag.
    #[test]
    fn test_parse_node_tree() {
        let child = ParseNode::Leaf(Token {
            value: "2024".to_string(),
            tag: PosTag::Yr,
            start_line: LineNumber::new(3).expect("3 is a valid line number"),
        });
        let tree = ParseNode::Tree {
            label: TreeLabel::YrRange,
            children: vec![child],
        };
        assert_eq!(tree.label(), Some(TreeLabel::YrRange));
        assert_eq!(tree.tag(), None);
    }
}