Skip to main content

provenant/copyright/
types.rs

//! Core types for copyright detection.
//!
//! This module defines:
//! - Detection result types ([`CopyrightDetection`], [`HolderDetection`], [`AuthorDetection`])
//! - The POS tag enum ([`PosTag`]) with 52 variants for token classification
//! - Parse tree types ([`ParseNode`], [`TreeLabel`]) for grammar-based extraction
//! - The [`Token`] struct linking text values to POS tags and source locations
8
9use serde::Serialize;
10
11/// A detected copyright statement with source location.
12#[derive(Debug, Clone, PartialEq, Serialize)]
13pub struct CopyrightDetection {
14    /// The full copyright text (e.g., "Copyright 2024 Acme Inc.").
15    pub copyright: String,
16    /// 1-based line number where this detection starts.
17    pub start_line: usize,
18    /// 1-based line number where this detection ends.
19    pub end_line: usize,
20}
21
22/// A detected copyright holder name with source location.
23#[derive(Debug, Clone, PartialEq, Serialize)]
24pub struct HolderDetection {
25    /// The holder name (e.g., "Acme Inc.").
26    pub holder: String,
27    /// 1-based line number where this detection starts.
28    pub start_line: usize,
29    /// 1-based line number where this detection ends.
30    pub end_line: usize,
31}
32
33/// A detected author name with source location.
34#[derive(Debug, Clone, PartialEq, Serialize)]
35pub struct AuthorDetection {
36    /// The author name (e.g., "John Doe").
37    pub author: String,
38    /// 1-based line number where this detection starts.
39    pub start_line: usize,
40    /// 1-based line number where this detection ends.
41    pub end_line: usize,
42}
43
/// Part-of-speech tag assigned to a token.
///
/// Tags are a closed enum rather than strings, so the compiler checks every
/// use. The example texts on each variant illustrate what the tag is meant
/// to cover.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum PosTag {
    // ---- Copyright keywords ----
    /// "Copyright", "(c)", "Copr.", etc.
    Copy,
    /// "SPDX-FileContributor".
    SpdxContrib,

    // ---- Year-related ----
    /// A year like "2024".
    Yr,
    /// Year with plus: "2024+".
    YrPlus,
    /// Short year: "99".
    BareYr,

    // ---- Names and entities ----
    /// Proper noun: "John", "Smith".
    Nnp,
    /// Common noun (catch-all).
    Nn,
    /// All-caps word: "MIT", "IBM".
    Caps,
    /// Dotted name: "P.", "DMTF.".
    Pn,
    /// Mixed case: "LeGrande".
    MixedCap,

    // ---- Organization suffixes ----
    /// Company suffix: "Inc.", "Ltd.", "GmbH".
    Comp,
    /// University: "University", "College".
    Uni,

    // ---- Author keywords ----
    /// "Author", "@author".
    Auth,
    /// "Written", "Developed", "Created".
    Auth2,
    /// "Authors", "author's".
    Auths,
    /// "Author.", "Authors.".
    AuthDot,
    /// "Maintainer", "Developer".
    Maint,
    /// "Contributors".
    Contributors,
    /// "Committers".
    Commit,

    // ---- Rights reserved ----
    /// "Rights", "Rechte", "Droits".
    Right,
    /// "Reserved", "Vorbehalten", "Réservés".
    Reserved,

    // ---- Conjunctions and prepositions ----
    /// "and", "&", ",".
    Cc,
    /// "of", "De", "Di".
    Of,
    /// "by".
    By,
    /// "in", "en".
    In,
    /// "van", "von", "de", "du".
    Van,
    /// "to".
    To,
    /// "-", "--", "/".
    Dash,

    // ---- Special ----
    /// Email address.
    Email,
    /// Email opening bracket like "<foo".
    EmailStart,
    /// Email closing bracket like "bar>".
    EmailEnd,
    /// URL with scheme.
    Url,
    /// URL without scheme (domain.com).
    Url2,
    /// "Holder", "Holders".
    Holder,
    /// "is", "are".
    Is,
    /// "held".
    Held,
    /// "NOTICE".
    Notice,
    /// "Portions", "Parts".
    Portions,
    /// "Others", "et al.".
    Oth,
    /// "following".
    Following,
    /// "MIT" (special handling).
    Mit,
    /// "Linux".
    Linux,
    /// "(" or ")".
    Parens,
    /// "AT" (obfuscated email).
    At,
    /// "DOT" (obfuscated email).
    Dot,
    /// "OU" (org unit in certs).
    Ou,

    // ---- Structural ----
    /// Empty line marker.
    EmptyLine,
    /// Junk to ignore.
    Junk,

    // ---- Cardinals ----
    /// Cardinal number.
    Cd,
    /// Small cardinal (0-39).
    Cds,
    /// Month abbreviation.
    Month,
    /// Day of week.
    Day,
}
119
120/// A token with its POS tag and source location.
121#[derive(Debug, Clone)]
122pub struct Token {
123    /// The token text (e.g., "Copyright", "2024", "Acme").
124    pub value: String,
125    /// The assigned POS tag.
126    pub tag: PosTag,
127    /// 1-based source line number.
128    pub start_line: usize,
129}
130
131/// A node in the parse tree
132#[derive(Debug, Clone)]
133pub enum ParseNode {
134    Leaf(Token),
135    Tree {
136        label: TreeLabel,
137        children: Vec<ParseNode>,
138    },
139}
140
/// Grammar non-terminals used to label parse tree nodes.
///
/// Variant order is kept stable so the implicit discriminants do not change.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum TreeLabel {
    YrRange,
    YrAnd,
    AllRightReserved,
    Name,
    NameEmail,
    NameYear,
    NameCopy,
    NameCaps,
    Company,
    AndCo,
    Copyright,
    Copyright2,
    Author,
    AndAuth,
    InitialDev,
    DashCaps,
}
161
162impl ParseNode {
163    /// Get the tag of this node (for leaf tokens) or None (for trees)
164    pub fn tag(&self) -> Option<PosTag> {
165        match self {
166            ParseNode::Leaf(token) => Some(token.tag),
167            ParseNode::Tree { .. } => None,
168        }
169    }
170
171    /// Get the label of this node (for trees) or None (for leaf tokens)
172    pub fn label(&self) -> Option<TreeLabel> {
173        match self {
174            ParseNode::Tree { label, .. } => Some(*label),
175            ParseNode::Leaf(_) => None,
176        }
177    }
178}
179
#[cfg(test)]
mod tests {
    use super::*;

    /// Constructing a detection keeps every field as given.
    #[test]
    fn test_copyright_detection_creation() {
        let d = CopyrightDetection {
            copyright: "Copyright 2024 Acme Inc.".to_string(),
            start_line: 1,
            end_line: 1,
        };
        assert_eq!(d.copyright, "Copyright 2024 Acme Inc.");
        assert_eq!(d.start_line, 1);
        assert_eq!(d.end_line, 1);
    }

    /// Constructing a token keeps its text, tag and line number.
    #[test]
    fn test_token_creation() {
        let t = Token {
            value: "Copyright".to_string(),
            tag: PosTag::Copy,
            start_line: 1,
        };
        assert_eq!(t.value, "Copyright");
        assert_eq!(t.tag, PosTag::Copy);
        assert_eq!(t.start_line, 1);
    }

    /// A leaf exposes its token's tag and has no label.
    #[test]
    fn test_parse_node_leaf() {
        let node = ParseNode::Leaf(Token {
            value: "2024".to_string(),
            tag: PosTag::Yr,
            start_line: 5,
        });
        assert_eq!(node.tag(), Some(PosTag::Yr));
        assert_eq!(node.label(), None);
    }

    /// A tree exposes its label, has no tag, and keeps its children.
    #[test]
    fn test_parse_node_tree() {
        let child = ParseNode::Leaf(Token {
            value: "2024".to_string(),
            tag: PosTag::Yr,
            start_line: 3,
        });
        let tree = ParseNode::Tree {
            label: TreeLabel::YrRange,
            children: vec![child],
        };
        assert_eq!(tree.label(), Some(TreeLabel::YrRange));
        assert_eq!(tree.tag(), None);

        // The child handed to the tree must still be reachable and intact.
        match &tree {
            ParseNode::Tree { children, .. } => {
                assert_eq!(children.len(), 1);
                assert_eq!(children[0].tag(), Some(PosTag::Yr));
                assert_eq!(children[0].label(), None);
            }
            ParseNode::Leaf(_) => panic!("expected a Tree node"),
        }
    }
}