Skip to main content

provenant/license_detection/rules/
loader.rs

1//! Parse .LICENSE and .RULE files.
2//!
3//! This module provides two-stage loading:
4//! 1. Loader-stage: Parse files into `LoadedRule` and `LoadedLicense`
5//! 2. Build-stage: Convert to runtime `Rule` and `License` (deprecated filtering, etc.)
6//!
7//! The loader-stage functions (`parse_rule_to_loaded`, `parse_license_to_loaded`,
8//! `load_loaded_rules_from_directory`, `load_loaded_licenses_from_directory`) return
9//! all entries including deprecated ones. Deprecated filtering is a build-stage concern.
10
11use crate::license_detection::index::{loaded_license_to_license, loaded_rule_to_rule};
12use crate::license_detection::models::{License, LoadedLicense, LoadedRule, Rule};
13use anyhow::{Context, Result, anyhow};
14use log::warn;
15use once_cell::sync::Lazy;
16use regex::Regex;
17use serde::{Deserialize, Deserializer, Serialize};
18use std::collections::HashSet;
19use std::fs;
20use std::path::Path;
21
22static FM_BOUNDARY: Lazy<Regex> =
23    Lazy::new(|| Regex::new(r"(?m)^-{3,}\s*$").expect("Invalid frontmatter regex"));
24
25fn deserialize_yes_no_bool<'de, D>(deserializer: D) -> Result<Option<bool>, D::Error>
26where
27    D: Deserializer<'de>,
28{
29    #[derive(Deserialize, Serialize)]
30    #[serde(untagged)]
31    enum YesNoOrBool {
32        String(String),
33        Bool(bool),
34    }
35
36    match YesNoOrBool::deserialize(deserializer)? {
37        YesNoOrBool::Bool(b) => Ok(Some(b)),
38        YesNoOrBool::String(s) => {
39            let lower = s.to_lowercase();
40            if lower == "yes" || lower == "true" || lower == "1" {
41                Ok(Some(true))
42            } else if lower == "no" || lower == "false" || lower == "0" {
43                Ok(Some(false))
44            } else {
45                Ok(None)
46            }
47        }
48    }
49}
50
51trait ParseNumber {
52    fn as_u8(&self) -> Option<u8>;
53}
54
55impl ParseNumber for yaml_serde::Number {
56    fn as_u8(&self) -> Option<u8> {
57        self.as_i64()
58            .and_then(|n| {
59                if n >= 0 && n <= u8::MAX as i64 {
60                    Some(n as u8)
61                } else {
62                    None
63                }
64            })
65            .or_else(|| {
66                self.as_f64().and_then(|f| {
67                    if f >= 0.0 && f <= u8::MAX as f64 {
68                        Some(f as u8)
69                    } else {
70                        None
71                    }
72                })
73            })
74    }
75}
76
77#[derive(Debug, Deserialize)]
78#[allow(dead_code)]
79struct LicenseFrontmatter {
80    #[serde(default)]
81    key: Option<String>,
82
83    #[serde(default)]
84    short_name: Option<String>,
85
86    #[serde(default)]
87    name: Option<String>,
88
89    #[serde(default)]
90    category: Option<String>,
91
92    #[serde(default)]
93    owner: Option<String>,
94
95    #[serde(default)]
96    homepage_url: Option<String>,
97
98    #[serde(default)]
99    notes: Option<String>,
100
101    #[serde(default)]
102    spdx_license_key: Option<String>,
103
104    #[serde(default)]
105    other_spdx_license_keys: Option<Vec<String>>,
106
107    #[serde(default)]
108    osi_license_key: Option<String>,
109
110    #[serde(default)]
111    text_urls: Option<Vec<String>>,
112
113    #[serde(default)]
114    osi_url: Option<String>,
115
116    #[serde(default)]
117    faq_url: Option<String>,
118
119    #[serde(default)]
120    other_urls: Option<Vec<String>>,
121
122    #[serde(default, deserialize_with = "deserialize_yes_no_bool")]
123    is_deprecated: Option<bool>,
124
125    #[serde(default)]
126    replaced_by: Option<Vec<String>>,
127
128    #[serde(default, deserialize_with = "deserialize_yes_no_bool")]
129    is_exception: Option<bool>,
130
131    #[serde(default, deserialize_with = "deserialize_yes_no_bool")]
132    is_unknown: Option<bool>,
133
134    #[serde(default, deserialize_with = "deserialize_yes_no_bool")]
135    is_generic: Option<bool>,
136
137    #[serde(default)]
138    minimum_coverage: Option<yaml_serde::Number>,
139
140    #[serde(default)]
141    standard_notice: Option<String>,
142
143    #[serde(default)]
144    ignorable_copyrights: Option<Vec<String>>,
145
146    #[serde(default)]
147    ignorable_holders: Option<Vec<String>>,
148
149    #[serde(default)]
150    ignorable_authors: Option<Vec<String>>,
151
152    #[serde(default)]
153    ignorable_urls: Option<Vec<String>>,
154
155    #[serde(default)]
156    ignorable_emails: Option<Vec<String>>,
157}
158
/// Parsed rule file content, split into frontmatter and text.
struct ParsedRuleFile {
    /// Raw YAML found between the first and second `---` delimiter lines.
    yaml_content: String,
    /// Everything after the second delimiter, with surrounding whitespace trimmed.
    text_content: String,
    /// `true` when the frontmatter mapping contains a `minimum_coverage` key.
    has_stored_minimum_coverage: bool,
}
165
/// Parsed license file content, split into frontmatter and text.
struct ParsedLicenseFile {
    /// Raw YAML found between the first and second `---` delimiter lines.
    yaml_content: String,
    /// Everything after the second delimiter, with surrounding whitespace trimmed.
    text_content: String,
}
171
172/// Parse file content into frontmatter and text sections.
173///
174/// Returns `ParsedRuleFile` with yaml_content, text_content, and metadata.
175/// The `path` parameter is used for error messages only.
176fn parse_file_content(content: &str, path: &Path) -> Result<ParsedRuleFile> {
177    if content.len() < 6 {
178        return Err(anyhow!("File content too short: {}", path.display()));
179    }
180
181    let parts: Vec<&str> = FM_BOUNDARY.splitn(content, 3).collect();
182
183    if parts.len() < 3 {
184        let trimmed = content.trim();
185        if trimmed.is_empty() {
186            return Err(anyhow!(
187                "File is empty or has no content: {}",
188                path.display()
189            ));
190        }
191        return Err(anyhow!("File missing delimiter '---': {}", path.display()));
192    }
193
194    let yaml_content = parts
195        .get(1)
196        .ok_or_else(|| anyhow!("Missing YAML frontmatter in {}", path.display()))?
197        .to_string();
198    let text_content = parts
199        .get(2)
200        .ok_or_else(|| {
201            anyhow!(
202                "Missing text content after frontmatter in {}",
203                path.display()
204            )
205        })?
206        .trim_start_matches('\n')
207        .trim()
208        .to_string();
209
210    let frontmatter_value: yaml_serde::Value =
211        yaml_serde::from_str(&yaml_content).map_err(|e| {
212            anyhow!(
213                "Failed to parse frontmatter YAML in {}: {}\nContent was:\n{}",
214                path.display(),
215                e,
216                yaml_content
217            )
218        })?;
219
220    let has_stored_minimum_coverage = frontmatter_value.as_mapping().is_some_and(|mapping| {
221        mapping.contains_key(yaml_serde::Value::String("minimum_coverage".to_string()))
222    });
223
224    Ok(ParsedRuleFile {
225        yaml_content,
226        text_content,
227        has_stored_minimum_coverage,
228    })
229}
230
231#[derive(Debug, Deserialize)]
232#[allow(dead_code)]
233struct RuleFrontmatter {
234    #[serde(default)]
235    license_expression: Option<String>,
236
237    #[serde(default, deserialize_with = "deserialize_yes_no_bool")]
238    is_license_text: Option<bool>,
239
240    #[serde(default, deserialize_with = "deserialize_yes_no_bool")]
241    is_license_notice: Option<bool>,
242
243    #[serde(default, deserialize_with = "deserialize_yes_no_bool")]
244    is_license_reference: Option<bool>,
245
246    #[serde(default, deserialize_with = "deserialize_yes_no_bool")]
247    is_license_tag: Option<bool>,
248
249    #[serde(default, deserialize_with = "deserialize_yes_no_bool")]
250    is_license_intro: Option<bool>,
251
252    #[serde(default, deserialize_with = "deserialize_yes_no_bool")]
253    is_license_clue: Option<bool>,
254
255    #[serde(default, deserialize_with = "deserialize_yes_no_bool")]
256    is_false_positive: Option<bool>,
257
258    #[serde(default, deserialize_with = "deserialize_yes_no_bool")]
259    is_required_phrase: Option<bool>,
260
261    #[serde(default, deserialize_with = "deserialize_yes_no_bool")]
262    skip_for_required_phrase_generation: Option<bool>,
263
264    #[serde(default)]
265    relevance: Option<yaml_serde::Number>,
266
267    #[serde(default)]
268    minimum_coverage: Option<yaml_serde::Number>,
269
270    #[serde(default, deserialize_with = "deserialize_yes_no_bool")]
271    is_continuous: Option<bool>,
272
273    #[serde(default, deserialize_with = "deserialize_yes_no_bool")]
274    is_deprecated: Option<bool>,
275
276    #[serde(default)]
277    referenced_filenames: Option<Vec<String>>,
278
279    #[serde(default)]
280    replaced_by: Option<Vec<String>>,
281
282    #[serde(default)]
283    ignorable_urls: Option<Vec<String>>,
284
285    #[serde(default)]
286    ignorable_emails: Option<Vec<String>>,
287
288    #[serde(default)]
289    notes: Option<String>,
290
291    #[serde(default)]
292    ignorable_copyrights: Option<Vec<String>>,
293
294    #[serde(default)]
295    ignorable_holders: Option<Vec<String>>,
296
297    #[serde(default)]
298    ignorable_authors: Option<Vec<String>>,
299
300    #[serde(default)]
301    language: Option<String>,
302}
303
304/// Parse a .RULE file into a `LoadedRule` (loader-stage).
305///
306/// This function parses the file and returns a `LoadedRule` with normalized data.
307/// Deprecated entries are included - filtering is a build-stage concern.
308///
309/// # Arguments
310/// * `path` - Path to the .RULE file
311///
312/// # Returns
313/// * `Ok(LoadedRule)` - Successfully parsed rule
314/// * `Err(...)` - Parse error with context
315pub fn parse_rule_to_loaded(path: &Path) -> Result<LoadedRule> {
316    let content = fs::read_to_string(path)
317        .with_context(|| format!("Failed to read rule file: {}", path.display()))?;
318
319    let identifier = LoadedRule::derive_identifier(
320        path.file_name()
321            .and_then(|s| s.to_str())
322            .unwrap_or("unknown.RULE"),
323    );
324
325    let parsed = parse_file_content(&content, path)?;
326
327    if parsed.text_content.is_empty() {
328        return Err(anyhow!(
329            "Rule file has empty text content: {}",
330            path.display()
331        ));
332    }
333
334    let fm: RuleFrontmatter = yaml_serde::from_str(&parsed.yaml_content).map_err(|e| {
335        anyhow!(
336            "Failed to parse rule frontmatter YAML in {}: {}\nContent was:\n{}",
337            path.display(),
338            e,
339            parsed.yaml_content
340        )
341    })?;
342
343    let is_false_positive = fm.is_false_positive.unwrap_or(false);
344
345    let rule_kind = LoadedRule::derive_rule_kind(
346        fm.is_license_text.unwrap_or(false),
347        fm.is_license_notice.unwrap_or(false),
348        fm.is_license_reference.unwrap_or(false),
349        fm.is_license_tag.unwrap_or(false),
350        fm.is_license_intro.unwrap_or(false),
351        fm.is_license_clue.unwrap_or(false),
352    )
353    .map_err(|e| {
354        anyhow!(
355            "Rule file has invalid rule-kind flags: {}: {}",
356            path.display(),
357            e
358        )
359    })?;
360
361    LoadedRule::validate_rule_kind_flags(rule_kind, is_false_positive)
362        .map_err(|e| anyhow!("Rule file has invalid flags: {}: {}", path.display(), e))?;
363
364    let license_expression = LoadedRule::normalize_license_expression(
365        fm.license_expression.as_deref(),
366        is_false_positive,
367    )
368    .map_err(|e| {
369        anyhow!(
370            "Rule file has invalid license_expression: {}: {}",
371            path.display(),
372            e
373        )
374    })?;
375
376    let relevance = fm.relevance.and_then(|n| n.as_u8());
377
378    let minimum_coverage = fm.minimum_coverage.and_then(|n| n.as_u8());
379
380    Ok(LoadedRule {
381        identifier,
382        license_expression,
383        text: parsed.text_content,
384        rule_kind,
385        is_false_positive,
386        is_required_phrase: fm.is_required_phrase.unwrap_or(false),
387        skip_for_required_phrase_generation: fm
388            .skip_for_required_phrase_generation
389            .unwrap_or(false),
390        relevance,
391        minimum_coverage,
392        has_stored_minimum_coverage: parsed.has_stored_minimum_coverage,
393        is_continuous: fm.is_continuous.unwrap_or(false),
394        referenced_filenames: LoadedRule::normalize_optional_list(
395            fm.referenced_filenames.as_deref(),
396        ),
397        ignorable_urls: LoadedRule::normalize_optional_list(fm.ignorable_urls.as_deref()),
398        ignorable_emails: LoadedRule::normalize_optional_list(fm.ignorable_emails.as_deref()),
399        ignorable_copyrights: LoadedRule::normalize_optional_list(
400            fm.ignorable_copyrights.as_deref(),
401        ),
402        ignorable_holders: LoadedRule::normalize_optional_list(fm.ignorable_holders.as_deref()),
403        ignorable_authors: LoadedRule::normalize_optional_list(fm.ignorable_authors.as_deref()),
404        language: LoadedRule::normalize_optional_string(fm.language.as_deref()),
405        notes: LoadedRule::normalize_optional_string(fm.notes.as_deref()),
406        is_deprecated: fm.is_deprecated.unwrap_or(false),
407        replaced_by: fm.replaced_by.unwrap_or_default(),
408    })
409}
410
411/// Parse a .LICENSE file into a `LoadedLicense` (loader-stage).
412///
413/// This function parses the file and returns a `LoadedLicense` with normalized data.
414/// Deprecated entries are included - filtering is a build-stage concern.
415///
416/// # Arguments
417/// * `path` - Path to the .LICENSE file
418///
419/// # Returns
420/// * `Ok(LoadedLicense)` - Successfully parsed license
421/// * `Err(...)` - Parse error with context
422pub fn parse_license_to_loaded(path: &Path) -> Result<LoadedLicense> {
423    let content = fs::read_to_string(path)
424        .with_context(|| format!("Failed to read license file: {}", path.display()))?;
425
426    let key = LoadedLicense::derive_key(path)?;
427
428    let parsed = parse_license_file_content(&content, path)?;
429
430    let fm: LicenseFrontmatter = yaml_serde::from_str(&parsed.yaml_content).map_err(|e| {
431        anyhow!(
432            "Failed to parse license frontmatter YAML in {}: {}\nContent was:\n{}",
433            path.display(),
434            e,
435            parsed.yaml_content
436        )
437    })?;
438
439    LoadedLicense::validate_key_match(&key, fm.key.as_deref())
440        .map_err(|e| anyhow!("License file has key mismatch: {}: {}", path.display(), e))?;
441
442    let is_deprecated = fm.is_deprecated.unwrap_or(false);
443    let is_unknown = fm.is_unknown.unwrap_or(false);
444    let is_generic = fm.is_generic.unwrap_or(false);
445
446    LoadedLicense::validate_text_content(
447        &parsed.text_content,
448        is_deprecated,
449        is_unknown,
450        is_generic,
451    )
452    .map_err(|e| {
453        anyhow!(
454            "License file has invalid content: {}: {}",
455            path.display(),
456            e
457        )
458    })?;
459
460    let name = LoadedLicense::derive_name(fm.name.as_deref(), fm.short_name.as_deref(), &key);
461
462    let reference_urls = LoadedLicense::merge_reference_urls(
463        fm.text_urls.as_deref(),
464        fm.other_urls.as_deref(),
465        fm.osi_url.as_deref(),
466        fm.faq_url.as_deref(),
467        fm.homepage_url.as_deref(),
468    );
469
470    let minimum_coverage = fm.minimum_coverage.and_then(|n| n.as_u8());
471
472    Ok(LoadedLicense {
473        key,
474        short_name: LoadedLicense::normalize_optional_string(fm.short_name.as_deref()),
475        name,
476        language: Some("en".to_string()),
477        spdx_license_key: LoadedLicense::normalize_optional_string(fm.spdx_license_key.as_deref()),
478        other_spdx_license_keys: fm.other_spdx_license_keys.unwrap_or_default(),
479        category: LoadedLicense::normalize_optional_string(fm.category.as_deref()),
480        owner: LoadedLicense::normalize_optional_string(fm.owner.as_deref()),
481        homepage_url: LoadedLicense::normalize_optional_string(fm.homepage_url.as_deref()),
482        text: parsed.text_content,
483        reference_urls,
484        osi_license_key: LoadedLicense::normalize_optional_string(fm.osi_license_key.as_deref()),
485        text_urls: LoadedLicense::normalize_optional_list(fm.text_urls.as_deref())
486            .unwrap_or_default(),
487        osi_url: LoadedLicense::normalize_optional_string(fm.osi_url.as_deref()),
488        faq_url: LoadedLicense::normalize_optional_string(fm.faq_url.as_deref()),
489        other_urls: LoadedLicense::normalize_optional_list(fm.other_urls.as_deref())
490            .unwrap_or_default(),
491        notes: LoadedLicense::normalize_optional_string(fm.notes.as_deref()),
492        is_deprecated,
493        is_exception: fm.is_exception.unwrap_or(false),
494        is_unknown,
495        is_generic,
496        replaced_by: fm.replaced_by.unwrap_or_default(),
497        minimum_coverage,
498        standard_notice: LoadedLicense::normalize_optional_string(fm.standard_notice.as_deref()),
499        ignorable_copyrights: LoadedLicense::normalize_optional_list(
500            fm.ignorable_copyrights.as_deref(),
501        ),
502        ignorable_holders: LoadedLicense::normalize_optional_list(fm.ignorable_holders.as_deref()),
503        ignorable_authors: LoadedLicense::normalize_optional_list(fm.ignorable_authors.as_deref()),
504        ignorable_urls: LoadedLicense::normalize_optional_list(fm.ignorable_urls.as_deref()),
505        ignorable_emails: LoadedLicense::normalize_optional_list(fm.ignorable_emails.as_deref()),
506    })
507}
508
509/// Parse license file content into frontmatter and text sections.
510///
511/// The `path` parameter is used for error messages only.
512fn parse_license_file_content(content: &str, path: &Path) -> Result<ParsedLicenseFile> {
513    if content.len() < 6 {
514        return Err(anyhow!(
515            "License file content too short: {}",
516            path.display()
517        ));
518    }
519
520    let parts: Vec<&str> = FM_BOUNDARY.splitn(content, 3).collect();
521
522    if parts.len() < 3 {
523        let trimmed = content.trim();
524        if trimmed.is_empty() {
525            return Err(anyhow!(
526                "License file is empty or has no content: {}",
527                path.display()
528            ));
529        }
530        return Err(anyhow!(
531            "License file missing delimiter '---': {}",
532            path.display()
533        ));
534    }
535
536    let yaml_content = parts
537        .get(1)
538        .ok_or_else(|| anyhow!("Missing YAML frontmatter in {}", path.display()))?
539        .to_string();
540    let text_content = parts
541        .get(2)
542        .ok_or_else(|| {
543            anyhow!(
544                "Missing text content after frontmatter in {}",
545                path.display()
546            )
547        })?
548        .trim_start_matches('\n')
549        .trim()
550        .to_string();
551
552    Ok(ParsedLicenseFile {
553        yaml_content,
554        text_content,
555    })
556}
557
558/// Load all .RULE files from a directory into `LoadedRule` values (loader-stage).
559///
560/// This function loads ALL rules, including deprecated ones.
561/// Deprecated filtering is a build-stage concern.
562///
563/// # Arguments
564/// * `dir` - Directory containing .RULE files
565///
566/// # Returns
567/// * `Ok(Vec<LoadedRule>)` - All loaded rules (including deprecated)
568/// * `Err(...)` - Directory read error
569pub fn load_loaded_rules_from_directory(dir: &Path) -> Result<Vec<LoadedRule>> {
570    let mut rules = Vec::new();
571
572    let entries = fs::read_dir(dir)
573        .with_context(|| format!("Failed to read rules directory: {}", dir.display()))?;
574
575    for entry in entries {
576        let entry = entry
577            .with_context(|| format!("Failed to read directory entry in: {}", dir.display()))?;
578        let path = entry.path();
579
580        if path.is_file() && path.extension().and_then(|s| s.to_str()) == Some("RULE") {
581            match parse_rule_to_loaded(&path) {
582                Ok(rule) => rules.push(rule),
583                Err(e) => {
584                    warn!("Failed to parse rule file {}: {}", path.display(), e);
585                }
586            }
587        }
588    }
589
590    Ok(rules)
591}
592
593/// Load all .LICENSE files from a directory into `LoadedLicense` values (loader-stage).
594///
595/// This function loads ALL licenses, including deprecated ones.
596/// Deprecated filtering is a build-stage concern.
597///
598/// # Arguments
599/// * `dir` - Directory containing .LICENSE files
600///
601/// # Returns
602/// * `Ok(Vec<LoadedLicense>)` - All loaded licenses (including deprecated)
603/// * `Err(...)` - Directory read error
604pub fn load_loaded_licenses_from_directory(dir: &Path) -> Result<Vec<LoadedLicense>> {
605    let mut licenses = Vec::new();
606
607    let entries = fs::read_dir(dir)
608        .with_context(|| format!("Failed to read licenses directory: {}", dir.display()))?;
609
610    for entry in entries {
611        let entry = entry
612            .with_context(|| format!("Failed to read directory entry in: {}", dir.display()))?;
613        let path = entry.path();
614
615        if path.is_file() && path.extension().and_then(|s| s.to_str()) == Some("LICENSE") {
616            match parse_license_to_loaded(&path) {
617                Ok(license) => licenses.push(license),
618                Err(e) => {
619                    warn!("Failed to parse license file {}: {}", path.display(), e);
620                }
621            }
622        }
623    }
624
625    Ok(licenses)
626}
627
628/// Validate loaded rules for common issues.
629///
630/// Checks for:
631/// 1. Duplicate rule texts (warns if found)
632/// 2. Empty license expressions for non-false-positive rules (warns if found)
633///
634/// Corresponds to Python:
635/// - `models.py:validate()` for license expression validation
636/// - `index.py:_add_rules()` for duplicate detection via hash
637///
638/// Kept for backward compatibility with `load_rules_from_directory`.
639#[allow(dead_code)]
640fn validate_rules(rules: &[Rule]) {
641    let mut seen_texts: HashSet<&str> = HashSet::new();
642    let mut duplicate_count = 0;
643
644    for rule in rules {
645        if !seen_texts.insert(&rule.text) {
646            warn!(
647                "Duplicate rule text found for license_expression: {}",
648                rule.license_expression
649            );
650            duplicate_count += 1;
651        }
652
653        if !rule.is_false_positive && rule.license_expression.trim().is_empty() {
654            warn!("Rule has empty license_expression but is not marked as false_positive");
655        }
656    }
657
658    if duplicate_count > 0 {
659        warn!(
660            "Found {} duplicate rule text(s) during rule validation",
661            duplicate_count
662        );
663    }
664}
665
666/// Load all .RULE files from a directory into `Rule` values (backward-compatible).
667///
668/// This function loads rules and applies deprecated filtering during loading.
669/// For the two-stage pipeline, prefer `load_loaded_rules_from_directory` and
670/// `build_index_from_loaded`.
671///
672/// Kept for backward compatibility and testing despite not being used in production code.
673/// The new pipeline uses the two-stage loading process instead.
674#[allow(dead_code)]
675pub fn load_rules_from_directory(dir: &Path, with_deprecated: bool) -> Result<Vec<Rule>> {
676    let loaded = load_loaded_rules_from_directory(dir)?;
677    let rules: Vec<Rule> = loaded
678        .into_iter()
679        .filter(|r| with_deprecated || !r.is_deprecated)
680        .map(loaded_rule_to_rule)
681        .collect();
682    validate_rules(&rules);
683    Ok(rules)
684}
685
686/// Load all .LICENSE files from a directory into `License` values (backward-compatible).
687///
688/// This function loads licenses and applies deprecated filtering during loading.
689/// For the two-stage pipeline, prefer `load_loaded_licenses_from_directory` and
690/// `build_index_from_loaded`.
691///
692/// Kept for backward compatibility and testing despite not being used in production code.
693/// The new pipeline uses the two-stage loading process instead.
694#[allow(dead_code)]
695pub fn load_licenses_from_directory(dir: &Path, with_deprecated: bool) -> Result<Vec<License>> {
696    let loaded = load_loaded_licenses_from_directory(dir)?;
697    let licenses: Vec<License> = loaded
698        .into_iter()
699        .filter(|l| with_deprecated || !l.is_deprecated)
700        .map(loaded_license_to_license)
701        .collect();
702    Ok(licenses)
703}
704
705#[cfg(test)]
706mod tests {
707    use super::*;
708    use std::collections::HashMap;
709    use std::fs;
710    use tempfile::tempdir;
711
712    pub fn parse_rule_file(path: &Path) -> Result<Rule> {
713        let loaded = parse_rule_to_loaded(path)?;
714        Ok(loaded_rule_to_rule(loaded))
715    }
716
717    #[test]
718    fn test_parse_number_as_u8() {
719        let num_int: yaml_serde::Number = yaml_serde::from_str("100").unwrap();
720        assert_eq!(num_int.as_u8(), Some(100));
721
722        let num_out_of_range: yaml_serde::Number = yaml_serde::from_str("500").unwrap();
723        assert_eq!(num_out_of_range.as_u8(), None);
724
725        let num_float: yaml_serde::Number = yaml_serde::from_str("90.5").unwrap();
726        assert_eq!(num_float.as_u8(), Some(90));
727    }
728
729    #[test]
730    fn test_parse_simple_license_file() {
731        let dir = tempdir().unwrap();
732        let license_path = dir.path().join("mit.LICENSE");
733        fs::write(
734            &license_path,
735            r#"---
736key: mit
737short_name: MIT License
738name: MIT License
739category: Permissive
740spdx_license_key: MIT
741---
742MIT License text here"#,
743        )
744        .unwrap();
745
746        let license = parse_license_to_loaded(&license_path)
747            .map(loaded_license_to_license)
748            .unwrap();
749        assert_eq!(license.key, "mit");
750        assert_eq!(license.name, "MIT License");
751        assert!(license.text.contains("MIT License text"));
752    }
753
754    #[test]
755    fn test_parse_simple_rule_file() {
756        let dir = tempdir().unwrap();
757        let rule_path = dir.path().join("mit_1.RULE");
758        fs::write(
759            &rule_path,
760            r#"---
761license_expression: mit
762is_license_reference: yes
763relevance: 90
764referenced_filenames:
765    - MIT.txt
766---
767MIT.txt"#,
768        )
769        .unwrap();
770
771        let rule = parse_rule_file(&rule_path).unwrap();
772        assert_eq!(rule.license_expression, "mit");
773        assert_eq!(rule.text, "MIT.txt");
774        assert!(rule.is_license_reference());
775        assert_eq!(rule.relevance, 90);
776    }
777
778    #[test]
779    fn test_deserialize_yes_no_bool() {
780        let dir = tempdir().unwrap();
781        let rule_path = dir.path().join("test.RULE");
782
783        fs::write(
784            &rule_path,
785            r#"---
786license_expression: mit
787is_license_notice: yes
788is_license_tag: no
789---
790MIT License"#,
791        )
792        .unwrap();
793
794        let rule = parse_rule_file(&rule_path).unwrap();
795        assert!(rule.is_license_notice());
796        assert!(!rule.is_license_tag());
797    }
798
799    #[test]
800    fn test_load_licenses_from_directory() {
801        let dir = tempdir().unwrap();
802
803        fs::write(
804            dir.path().join("test.LICENSE"),
805            r#"---
806key: test
807name: Test License
808spdx_license_key: TEST
809category: Permissive
810---
811Test license text here"#,
812        )
813        .unwrap();
814
815        let licenses = load_licenses_from_directory(dir.path(), false).unwrap();
816        assert_eq!(licenses.len(), 1);
817
818        let license = &licenses[0];
819        assert_eq!(license.key, "test");
820        assert_eq!(license.name, "Test License");
821        assert_eq!(license.spdx_license_key, Some("TEST".to_string()));
822        assert!(!license.text.is_empty());
823    }
824
825    #[test]
826    fn test_load_rules_from_directory() {
827        let dir = tempdir().unwrap();
828
829        fs::write(
830            dir.path().join("test_1.RULE"),
831            r#"---
832license_expression: test
833is_license_reference: yes
834relevance: 85
835referenced_filenames:
836    - TEST.txt
837---
838TEST.txt"#,
839        )
840        .unwrap();
841
842        let rules = load_rules_from_directory(dir.path(), false).unwrap();
843        assert_eq!(rules.len(), 1);
844
845        let rule = &rules[0];
846        assert_eq!(rule.license_expression, "test");
847        assert!(rule.is_license_reference());
848        assert_eq!(rule.relevance, 85);
849    }
850
851    #[test]
852    fn test_validate_rules_detects_duplicates() {
853        let rules = vec![
854            Rule {
855                identifier: "mit.LICENSE".to_string(),
856                license_expression: "mit".to_string(),
857                text: "MIT License".to_string(),
858                tokens: vec![],
859                rule_kind: crate::license_detection::models::RuleKind::Text,
860                is_false_positive: false,
861                is_required_phrase: false,
862                is_from_license: false,
863                relevance: 100,
864                minimum_coverage: None,
865                has_stored_minimum_coverage: false,
866                is_continuous: false,
867                required_phrase_spans: vec![],
868                stopwords_by_pos: HashMap::new(),
869                referenced_filenames: None,
870                ignorable_urls: None,
871                ignorable_emails: None,
872                ignorable_copyrights: None,
873                ignorable_holders: None,
874                ignorable_authors: None,
875                language: None,
876                notes: None,
877                length_unique: 0,
878                high_length_unique: 0,
879                high_length: 0,
880                min_matched_length: 0,
881                min_high_matched_length: 0,
882                min_matched_length_unique: 0,
883                min_high_matched_length_unique: 0,
884                is_small: false,
885                is_tiny: false,
886                starts_with_license: false,
887                ends_with_license: false,
888                is_deprecated: false,
889                spdx_license_key: None,
890                other_spdx_license_keys: vec![],
891            },
892            Rule {
893                identifier: "apache-2.0.LICENSE".to_string(),
894                license_expression: "apache-2.0".to_string(),
895                text: "MIT License".to_string(),
896                tokens: vec![],
897                rule_kind: crate::license_detection::models::RuleKind::Text,
898                is_false_positive: false,
899                is_required_phrase: false,
900                is_from_license: false,
901                relevance: 100,
902                minimum_coverage: None,
903                has_stored_minimum_coverage: false,
904                is_continuous: false,
905                required_phrase_spans: vec![],
906                stopwords_by_pos: HashMap::new(),
907                referenced_filenames: None,
908                ignorable_urls: None,
909                ignorable_emails: None,
910                ignorable_copyrights: None,
911                ignorable_holders: None,
912                ignorable_authors: None,
913                language: None,
914                notes: None,
915                length_unique: 0,
916                high_length_unique: 0,
917                high_length: 0,
918                min_matched_length: 0,
919                min_high_matched_length: 0,
920                min_matched_length_unique: 0,
921                min_high_matched_length_unique: 0,
922                is_small: false,
923                is_tiny: false,
924                starts_with_license: false,
925                ends_with_license: false,
926                is_deprecated: false,
927                spdx_license_key: None,
928                other_spdx_license_keys: vec![],
929            },
930        ];
931
932        validate_rules(&rules);
933    }
934
/// Checks that `validate_rules` runs to completion (no panic) for a single rule
/// that is flagged as a false positive and carries an empty license expression.
#[test]
fn test_validate_rules_accepts_false_positive_without_expression() {
    let false_positive = Rule {
        // Identity and content: note the deliberately empty expression.
        identifier: String::from("fp.RULE"),
        license_expression: String::new(),
        text: String::from("Some text"),
        tokens: Vec::new(),
        notes: Some(String::from("False positive for common pattern")),
        language: None,

        // Classification flags: only `is_false_positive` is set.
        rule_kind: crate::license_detection::models::RuleKind::None,
        is_false_positive: true,
        is_required_phrase: false,
        is_from_license: false,
        is_continuous: false,
        is_deprecated: false,
        is_small: false,
        is_tiny: false,
        starts_with_license: false,
        ends_with_license: false,

        // Relevance and coverage.
        relevance: 100,
        minimum_coverage: None,
        has_stored_minimum_coverage: false,

        // Collections are left empty / unset.
        required_phrase_spans: Vec::new(),
        stopwords_by_pos: HashMap::new(),
        referenced_filenames: None,
        ignorable_urls: None,
        ignorable_emails: None,
        ignorable_copyrights: None,
        ignorable_holders: None,
        ignorable_authors: None,

        // All length counters are zeroed.
        length_unique: 0,
        high_length_unique: 0,
        high_length: 0,
        min_matched_length: 0,
        min_high_matched_length: 0,
        min_matched_length_unique: 0,
        min_high_matched_length_unique: 0,

        // No SPDX information.
        spdx_license_key: None,
        other_spdx_license_keys: Vec::new(),
    };

    let rules = vec![false_positive];
    validate_rules(&rules);
}
978
/// Runs `validate_rules` over two text rules whose identifiers, license
/// expressions and texts all differ; the test passes as long as validation
/// does not panic.
#[test]
fn test_validate_rules_no_duplicates() {
    // The two fixtures differ only in these three strings, so both are stamped
    // out from a single template.
    let text_rule = |identifier: &str, expression: &str, text: &str| Rule {
        identifier: identifier.to_owned(),
        license_expression: expression.to_owned(),
        text: text.to_owned(),
        tokens: Vec::new(),
        rule_kind: crate::license_detection::models::RuleKind::Text,
        is_false_positive: false,
        is_required_phrase: false,
        is_from_license: false,
        relevance: 100,
        minimum_coverage: None,
        has_stored_minimum_coverage: false,
        is_continuous: false,
        required_phrase_spans: Vec::new(),
        stopwords_by_pos: HashMap::new(),
        referenced_filenames: None,
        ignorable_urls: None,
        ignorable_emails: None,
        ignorable_copyrights: None,
        ignorable_holders: None,
        ignorable_authors: None,
        language: None,
        notes: None,
        length_unique: 0,
        high_length_unique: 0,
        high_length: 0,
        min_matched_length: 0,
        min_high_matched_length: 0,
        min_matched_length_unique: 0,
        min_high_matched_length_unique: 0,
        is_small: false,
        is_tiny: false,
        starts_with_license: false,
        ends_with_license: false,
        is_deprecated: false,
        spdx_license_key: None,
        other_spdx_license_keys: Vec::new(),
    };

    let rules = vec![
        text_rule("mit.LICENSE", "mit", "MIT License"),
        text_rule("apache-2.0.LICENSE", "apache-2.0", "Apache License"),
    ];

    validate_rules(&rules);
}
1062
/// Writes one active and one deprecated `.LICENSE` file, then checks that
/// `load_licenses_from_directory` returns only the active one when called with
/// `false` and both when called with `true`.
#[test]
fn test_load_licenses_filters_deprecated_by_default() {
    let dir = tempdir().unwrap();
    let root = dir.path();

    // (file name, file contents) for every fixture placed in the directory.
    let fixtures = [
        (
            "active.LICENSE",
            r#"---
key: active
name: Active License
---
Active license text"#,
        ),
        (
            "deprecated.LICENSE",
            r#"---
key: deprecated
name: Deprecated License
is_deprecated: yes
---
Deprecated license text"#,
        ),
    ];
    for &(file_name, contents) in &fixtures {
        fs::write(root.join(file_name), contents).unwrap();
    }

    // Flag off: the deprecated entry is dropped.
    let active_only = load_licenses_from_directory(root, false).unwrap();
    assert_eq!(active_only.len(), 1);
    assert_eq!(active_only[0].key, "active");

    // Flag on: both entries come back.
    let everything = load_licenses_from_directory(root, true).unwrap();
    assert_eq!(everything.len(), 2);
}
1095
/// Writes one active and one deprecated `.RULE` file, then checks that
/// `load_rules_from_directory` returns only the active one when called with
/// `false` and both when called with `true`.
#[test]
fn test_load_rules_filters_deprecated_by_default() {
    let dir = tempdir().unwrap();
    let root = dir.path();

    // (file name, file contents) for every fixture placed in the directory.
    let fixtures = [
        (
            "active.RULE",
            r#"---
license_expression: active
is_license_notice: yes
---
Active rule text"#,
        ),
        (
            "deprecated.RULE",
            r#"---
license_expression: deprecated
is_license_notice: yes
is_deprecated: yes
---
Deprecated rule text"#,
        ),
    ];
    for &(file_name, contents) in &fixtures {
        fs::write(root.join(file_name), contents).unwrap();
    }

    // Flag off: the deprecated entry is dropped.
    let active_only = load_rules_from_directory(root, false).unwrap();
    assert_eq!(active_only.len(), 1);
    assert_eq!(active_only[0].license_expression, "active");

    // Flag on: both entries come back.
    let everything = load_rules_from_directory(root, true).unwrap();
    assert_eq!(everything.len(), 2);
}
1128
/// Parses a single `.RULE` file with `parse_rule_to_loaded` and checks the
/// identifier, expression, text, rule kind, relevance, referenced filenames and
/// deprecation flag of the result.
#[test]
fn test_parse_rule_to_loaded() {
    let dir = tempdir().unwrap();
    let rule_path = dir.path().join("mit_1.RULE");

    let rule_source = r#"---
license_expression: mit
is_license_reference: yes
relevance: 90
referenced_filenames:
    - MIT.txt
---
MIT.txt"#;
    fs::write(&rule_path, rule_source).unwrap();

    let parsed = parse_rule_to_loaded(&rule_path).unwrap();

    // Values taken from the file name and the body below the frontmatter.
    assert_eq!(parsed.identifier, "mit_1.RULE");
    assert_eq!(parsed.license_expression, "mit");
    assert_eq!(parsed.text, "MIT.txt");

    // Values taken from the frontmatter keys.
    let expected_kind = crate::license_detection::models::RuleKind::Reference;
    assert_eq!(parsed.rule_kind, expected_kind);
    assert_eq!(parsed.relevance, Some(90));
    let expected_filenames = Some(vec![String::from("MIT.txt")]);
    assert_eq!(parsed.referenced_filenames, expected_filenames);

    // The fixture has no `is_deprecated` key.
    assert!(!parsed.is_deprecated);
}
1161
/// Parses a single `.LICENSE` file with `parse_license_to_loaded` and checks
/// the key, name, text and SPDX key of the result.
#[test]
fn test_parse_license_to_loaded() {
    let dir = tempdir().unwrap();
    let license_path = dir.path().join("mit.LICENSE");

    let license_source = r#"---
key: mit
short_name: MIT License
name: MIT License
category: Permissive
spdx_license_key: MIT
---
MIT License text here"#;
    fs::write(&license_path, license_source).unwrap();

    let parsed = parse_license_to_loaded(&license_path).unwrap();

    assert_eq!(parsed.key, "mit");
    assert_eq!(parsed.name, "MIT License");
    // Only a substring is checked, so surrounding whitespace in the text is not pinned.
    assert!(parsed.text.contains("MIT License text"));
    let expected_spdx = Some(String::from("MIT"));
    assert_eq!(parsed.spdx_license_key, expected_spdx);
}
1185
/// Writes one active and one deprecated `.RULE` file and checks that
/// `load_loaded_rules_from_directory` returns both, each with the matching
/// `is_deprecated` flag.
#[test]
fn test_load_loaded_rules_from_directory_includes_deprecated() {
    let dir = tempdir().unwrap();
    let root = dir.path();

    // (file name, file contents) for every fixture placed in the directory.
    let fixtures = [
        (
            "active.RULE",
            r#"---
license_expression: active
is_license_notice: yes
---
Active rule text"#,
        ),
        (
            "deprecated.RULE",
            r#"---
license_expression: deprecated
is_license_notice: yes
is_deprecated: yes
---
Deprecated rule text"#,
        ),
    ];
    for &(file_name, contents) in &fixtures {
        fs::write(root.join(file_name), contents).unwrap();
    }

    let loaded_rules = load_loaded_rules_from_directory(root).unwrap();
    assert_eq!(loaded_rules.len(), 2);

    // Looks a rule up by its license expression; panics if it is missing.
    let by_expression = |expression: &str| {
        loaded_rules
            .iter()
            .find(|rule| rule.license_expression == expression)
            .unwrap()
    };

    assert!(!by_expression("active").is_deprecated);
    assert!(by_expression("deprecated").is_deprecated);
}
1226
/// Writes one active and one deprecated `.LICENSE` file and checks that
/// `load_loaded_licenses_from_directory` returns both, each with the matching
/// `is_deprecated` flag.
#[test]
fn test_load_loaded_licenses_from_directory_includes_deprecated() {
    let dir = tempdir().unwrap();
    let root = dir.path();

    // (file name, file contents) for every fixture placed in the directory.
    let fixtures = [
        (
            "active.LICENSE",
            r#"---
key: active
name: Active License
---
Active license text"#,
        ),
        (
            "deprecated.LICENSE",
            r#"---
key: deprecated
name: Deprecated License
is_deprecated: yes
---
Deprecated license text"#,
        ),
    ];
    for &(file_name, contents) in &fixtures {
        fs::write(root.join(file_name), contents).unwrap();
    }

    let loaded_licenses = load_loaded_licenses_from_directory(root).unwrap();
    assert_eq!(loaded_licenses.len(), 2);

    // Looks a license up by its key; panics if it is missing.
    let by_key = |key: &str| {
        loaded_licenses
            .iter()
            .find(|license| license.key == key)
            .unwrap()
    };

    assert!(!by_key("active").is_deprecated);
    assert!(by_key("deprecated").is_deprecated);
}
1264}