Skip to main content

provenant/license_detection/rules/
loader.rs

1//! Parse .LICENSE and .RULE files.
2//!
3//! This module provides two-stage loading:
4//! 1. Loader-stage: Parse files into `LoadedRule` and `LoadedLicense`
5//! 2. Build-stage: Convert to runtime `Rule` and `License` (deprecated filtering, etc.)
6//!
7//! The loader-stage functions (`parse_rule_to_loaded`, `parse_license_to_loaded`,
8//! `load_loaded_rules_from_directory`, `load_loaded_licenses_from_directory`) return
9//! all entries including deprecated ones. Deprecated filtering is a build-stage concern.
10
11use crate::license_detection::index::{loaded_license_to_license, loaded_rule_to_rule};
12use crate::license_detection::models::{License, LoadedLicense, LoadedRule, Rule};
13use anyhow::{Context, Result, anyhow};
14use log::warn;
15use once_cell::sync::Lazy;
16use regex::Regex;
17use serde::{Deserialize, Deserializer, Serialize};
18use std::collections::HashSet;
19use std::fs;
20use std::path::Path;
21
/// Frontmatter delimiter: a line (multi-line mode) that starts with three or
/// more `-` characters and is followed only by optional whitespace.
///
/// Compiled lazily on first use; the pattern is a constant, so a compile
/// failure would be a programming error and panics via `expect`.
static FM_BOUNDARY: Lazy<Regex> =
    Lazy::new(|| Regex::new(r"(?m)^-{3,}\s*$").expect("Invalid frontmatter regex"));
24
25fn deserialize_yes_no_bool<'de, D>(deserializer: D) -> Result<Option<bool>, D::Error>
26where
27    D: Deserializer<'de>,
28{
29    #[derive(Deserialize, Serialize)]
30    #[serde(untagged)]
31    enum YesNoOrBool {
32        String(String),
33        Bool(bool),
34    }
35
36    match YesNoOrBool::deserialize(deserializer)? {
37        YesNoOrBool::Bool(b) => Ok(Some(b)),
38        YesNoOrBool::String(s) => {
39            let lower = s.to_lowercase();
40            if lower == "yes" || lower == "true" || lower == "1" {
41                Ok(Some(true))
42            } else if lower == "no" || lower == "false" || lower == "0" {
43                Ok(Some(false))
44            } else {
45                Ok(None)
46            }
47        }
48    }
49}
50
/// Extension trait for narrowing a parsed number down to a `u8`.
trait ParseNumber {
    /// Returns the number as a `u8`, or `None` when it cannot be narrowed.
    fn as_u8(&self) -> Option<u8>;
}
54
55impl ParseNumber for serde_yaml::Number {
56    fn as_u8(&self) -> Option<u8> {
57        self.as_i64()
58            .and_then(|n| {
59                if n >= 0 && n <= u8::MAX as i64 {
60                    Some(n as u8)
61                } else {
62                    None
63                }
64            })
65            .or_else(|| {
66                self.as_f64().and_then(|f| {
67                    if f >= 0.0 && f <= u8::MAX as f64 {
68                        Some(f as u8)
69                    } else {
70                        None
71                    }
72                })
73            })
74    }
75}
76
/// Raw YAML frontmatter of a `.LICENSE` file, as deserialized by serde.
///
/// Every field is optional and defaults to `None` when absent. Boolean flags
/// go through `deserialize_yes_no_bool`, so they accept both native booleans
/// and yes/no-style strings.
///
/// `dead_code` is allowed because some fields are deserialized but not read
/// by the loader in this file (`owner`, `osi_license_key`, `is_exception`,
/// `standard_notice`).
#[derive(Debug, Deserialize)]
#[allow(dead_code)]
struct LicenseFrontmatter {
    // Checked against the key derived from the file path.
    #[serde(default)]
    key: Option<String>,

    #[serde(default)]
    short_name: Option<String>,

    #[serde(default)]
    name: Option<String>,

    #[serde(default)]
    category: Option<String>,

    #[serde(default)]
    owner: Option<String>,

    #[serde(default)]
    homepage_url: Option<String>,

    #[serde(default)]
    notes: Option<String>,

    #[serde(default)]
    spdx_license_key: Option<String>,

    #[serde(default)]
    other_spdx_license_keys: Option<Vec<String>>,

    #[serde(default)]
    osi_license_key: Option<String>,

    // The URL fields below (together with `homepage_url`) are merged into a
    // single reference URL list by the loader.
    #[serde(default)]
    text_urls: Option<Vec<String>>,

    #[serde(default)]
    osi_url: Option<String>,

    #[serde(default)]
    faq_url: Option<String>,

    #[serde(default)]
    other_urls: Option<Vec<String>>,

    #[serde(default, deserialize_with = "deserialize_yes_no_bool")]
    is_deprecated: Option<bool>,

    #[serde(default)]
    replaced_by: Option<Vec<String>>,

    #[serde(default, deserialize_with = "deserialize_yes_no_bool")]
    is_exception: Option<bool>,

    #[serde(default, deserialize_with = "deserialize_yes_no_bool")]
    is_unknown: Option<bool>,

    #[serde(default, deserialize_with = "deserialize_yes_no_bool")]
    is_generic: Option<bool>,

    // Kept as a generic YAML number; narrowed to `u8` by the loader.
    #[serde(default)]
    minimum_coverage: Option<serde_yaml::Number>,

    #[serde(default)]
    standard_notice: Option<String>,

    #[serde(default)]
    ignorable_copyrights: Option<Vec<String>>,

    #[serde(default)]
    ignorable_holders: Option<Vec<String>>,

    #[serde(default)]
    ignorable_authors: Option<Vec<String>>,

    #[serde(default)]
    ignorable_urls: Option<Vec<String>>,

    #[serde(default)]
    ignorable_emails: Option<Vec<String>>,
}
158
/// Parsed rule file content, split into frontmatter and text.
struct ParsedRuleFile {
    /// Raw YAML found between the first two delimiter lines (not yet
    /// deserialized into a typed frontmatter struct).
    yaml_content: String,
    /// Everything after the second delimiter, with surrounding whitespace trimmed.
    text_content: String,
    /// True when the frontmatter is a YAML mapping that contains a
    /// `minimum_coverage` key.
    has_stored_minimum_coverage: bool,
}
165
/// Parsed license file content, split into frontmatter and text.
struct ParsedLicenseFile {
    /// Raw YAML found between the first two delimiter lines.
    yaml_content: String,
    /// Everything after the second delimiter, with surrounding whitespace trimmed.
    text_content: String,
}
171
172/// Parse file content into frontmatter and text sections.
173///
174/// Returns `ParsedRuleFile` with yaml_content, text_content, and metadata.
175/// The `path` parameter is used for error messages only.
176fn parse_file_content(content: &str, path: &Path) -> Result<ParsedRuleFile> {
177    if content.len() < 6 {
178        return Err(anyhow!("File content too short: {}", path.display()));
179    }
180
181    let parts: Vec<&str> = FM_BOUNDARY.splitn(content, 3).collect();
182
183    if parts.len() < 3 {
184        let trimmed = content.trim();
185        if trimmed.is_empty() {
186            return Err(anyhow!(
187                "File is empty or has no content: {}",
188                path.display()
189            ));
190        }
191        return Err(anyhow!("File missing delimiter '---': {}", path.display()));
192    }
193
194    let yaml_content = parts
195        .get(1)
196        .ok_or_else(|| anyhow!("Missing YAML frontmatter in {}", path.display()))?
197        .to_string();
198    let text_content = parts
199        .get(2)
200        .ok_or_else(|| {
201            anyhow!(
202                "Missing text content after frontmatter in {}",
203                path.display()
204            )
205        })?
206        .trim_start_matches('\n')
207        .trim()
208        .to_string();
209
210    let frontmatter_value: serde_yaml::Value =
211        serde_yaml::from_str(&yaml_content).map_err(|e| {
212            anyhow!(
213                "Failed to parse frontmatter YAML in {}: {}\nContent was:\n{}",
214                path.display(),
215                e,
216                yaml_content
217            )
218        })?;
219
220    let has_stored_minimum_coverage = frontmatter_value.as_mapping().is_some_and(|mapping| {
221        mapping.contains_key(serde_yaml::Value::String("minimum_coverage".to_string()))
222    });
223
224    Ok(ParsedRuleFile {
225        yaml_content,
226        text_content,
227        has_stored_minimum_coverage,
228    })
229}
230
/// Raw YAML frontmatter of a `.RULE` file, as deserialized by serde.
///
/// Every field is optional and defaults to `None` when absent. Boolean flags
/// go through `deserialize_yes_no_bool`, so they accept both native booleans
/// and yes/no-style strings.
///
/// `dead_code` is allowed because some fields are deserialized but not read
/// by the loader in this file (`skip_for_required_phrase_generation`,
/// `replaced_by`).
#[derive(Debug, Deserialize)]
#[allow(dead_code)]
struct RuleFrontmatter {
    #[serde(default)]
    license_expression: Option<String>,

    // The six `is_license_*` flags below are combined into one rule kind by
    // the loader.
    #[serde(default, deserialize_with = "deserialize_yes_no_bool")]
    is_license_text: Option<bool>,

    #[serde(default, deserialize_with = "deserialize_yes_no_bool")]
    is_license_notice: Option<bool>,

    #[serde(default, deserialize_with = "deserialize_yes_no_bool")]
    is_license_reference: Option<bool>,

    #[serde(default, deserialize_with = "deserialize_yes_no_bool")]
    is_license_tag: Option<bool>,

    #[serde(default, deserialize_with = "deserialize_yes_no_bool")]
    is_license_intro: Option<bool>,

    #[serde(default, deserialize_with = "deserialize_yes_no_bool")]
    is_license_clue: Option<bool>,

    #[serde(default, deserialize_with = "deserialize_yes_no_bool")]
    is_false_positive: Option<bool>,

    #[serde(default, deserialize_with = "deserialize_yes_no_bool")]
    is_required_phrase: Option<bool>,

    #[serde(default, deserialize_with = "deserialize_yes_no_bool")]
    skip_for_required_phrase_generation: Option<bool>,

    // Kept as generic YAML numbers; narrowed to `u8` by the loader.
    #[serde(default)]
    relevance: Option<serde_yaml::Number>,

    #[serde(default)]
    minimum_coverage: Option<serde_yaml::Number>,

    #[serde(default, deserialize_with = "deserialize_yes_no_bool")]
    is_continuous: Option<bool>,

    #[serde(default, deserialize_with = "deserialize_yes_no_bool")]
    is_deprecated: Option<bool>,

    #[serde(default)]
    referenced_filenames: Option<Vec<String>>,

    #[serde(default)]
    replaced_by: Option<Vec<String>>,

    #[serde(default)]
    ignorable_urls: Option<Vec<String>>,

    #[serde(default)]
    ignorable_emails: Option<Vec<String>>,

    #[serde(default)]
    notes: Option<String>,

    #[serde(default)]
    ignorable_copyrights: Option<Vec<String>>,

    #[serde(default)]
    ignorable_holders: Option<Vec<String>>,

    #[serde(default)]
    ignorable_authors: Option<Vec<String>>,

    #[serde(default)]
    language: Option<String>,
}
303
304/// Parse a .RULE file into a `LoadedRule` (loader-stage).
305///
306/// This function parses the file and returns a `LoadedRule` with normalized data.
307/// Deprecated entries are included - filtering is a build-stage concern.
308///
309/// # Arguments
310/// * `path` - Path to the .RULE file
311///
312/// # Returns
313/// * `Ok(LoadedRule)` - Successfully parsed rule
314/// * `Err(...)` - Parse error with context
315pub fn parse_rule_to_loaded(path: &Path) -> Result<LoadedRule> {
316    let content = fs::read_to_string(path)
317        .with_context(|| format!("Failed to read rule file: {}", path.display()))?;
318
319    let identifier = LoadedRule::derive_identifier(
320        path.file_name()
321            .and_then(|s| s.to_str())
322            .unwrap_or("unknown.RULE"),
323    );
324
325    let parsed = parse_file_content(&content, path)?;
326
327    if parsed.text_content.is_empty() {
328        return Err(anyhow!(
329            "Rule file has empty text content: {}",
330            path.display()
331        ));
332    }
333
334    let fm: RuleFrontmatter = serde_yaml::from_str(&parsed.yaml_content).map_err(|e| {
335        anyhow!(
336            "Failed to parse rule frontmatter YAML in {}: {}\nContent was:\n{}",
337            path.display(),
338            e,
339            parsed.yaml_content
340        )
341    })?;
342
343    let is_false_positive = fm.is_false_positive.unwrap_or(false);
344
345    let rule_kind = LoadedRule::derive_rule_kind(
346        fm.is_license_text.unwrap_or(false),
347        fm.is_license_notice.unwrap_or(false),
348        fm.is_license_reference.unwrap_or(false),
349        fm.is_license_tag.unwrap_or(false),
350        fm.is_license_intro.unwrap_or(false),
351        fm.is_license_clue.unwrap_or(false),
352    )
353    .map_err(|e| {
354        anyhow!(
355            "Rule file has invalid rule-kind flags: {}: {}",
356            path.display(),
357            e
358        )
359    })?;
360
361    LoadedRule::validate_rule_kind_flags(rule_kind, is_false_positive)
362        .map_err(|e| anyhow!("Rule file has invalid flags: {}: {}", path.display(), e))?;
363
364    let license_expression = LoadedRule::normalize_license_expression(
365        fm.license_expression.as_deref(),
366        is_false_positive,
367    )
368    .map_err(|e| {
369        anyhow!(
370            "Rule file has invalid license_expression: {}: {}",
371            path.display(),
372            e
373        )
374    })?;
375
376    let relevance = fm.relevance.and_then(|n| n.as_u8());
377
378    let minimum_coverage = fm.minimum_coverage.and_then(|n| n.as_u8());
379
380    Ok(LoadedRule {
381        identifier,
382        license_expression,
383        text: parsed.text_content,
384        rule_kind,
385        is_false_positive,
386        is_required_phrase: fm.is_required_phrase.unwrap_or(false),
387        relevance,
388        minimum_coverage,
389        has_stored_minimum_coverage: parsed.has_stored_minimum_coverage,
390        is_continuous: fm.is_continuous.unwrap_or(false),
391        referenced_filenames: LoadedRule::normalize_optional_list(
392            fm.referenced_filenames.as_deref(),
393        ),
394        ignorable_urls: LoadedRule::normalize_optional_list(fm.ignorable_urls.as_deref()),
395        ignorable_emails: LoadedRule::normalize_optional_list(fm.ignorable_emails.as_deref()),
396        ignorable_copyrights: LoadedRule::normalize_optional_list(
397            fm.ignorable_copyrights.as_deref(),
398        ),
399        ignorable_holders: LoadedRule::normalize_optional_list(fm.ignorable_holders.as_deref()),
400        ignorable_authors: LoadedRule::normalize_optional_list(fm.ignorable_authors.as_deref()),
401        language: LoadedRule::normalize_optional_string(fm.language.as_deref()),
402        notes: LoadedRule::normalize_optional_string(fm.notes.as_deref()),
403        is_deprecated: fm.is_deprecated.unwrap_or(false),
404    })
405}
406
407/// Parse a .LICENSE file into a `LoadedLicense` (loader-stage).
408///
409/// This function parses the file and returns a `LoadedLicense` with normalized data.
410/// Deprecated entries are included - filtering is a build-stage concern.
411///
412/// # Arguments
413/// * `path` - Path to the .LICENSE file
414///
415/// # Returns
416/// * `Ok(LoadedLicense)` - Successfully parsed license
417/// * `Err(...)` - Parse error with context
418pub fn parse_license_to_loaded(path: &Path) -> Result<LoadedLicense> {
419    let content = fs::read_to_string(path)
420        .with_context(|| format!("Failed to read license file: {}", path.display()))?;
421
422    let key = LoadedLicense::derive_key(path)?;
423
424    let parsed = parse_license_file_content(&content, path)?;
425
426    let fm: LicenseFrontmatter = serde_yaml::from_str(&parsed.yaml_content).map_err(|e| {
427        anyhow!(
428            "Failed to parse license frontmatter YAML in {}: {}\nContent was:\n{}",
429            path.display(),
430            e,
431            parsed.yaml_content
432        )
433    })?;
434
435    LoadedLicense::validate_key_match(&key, fm.key.as_deref())
436        .map_err(|e| anyhow!("License file has key mismatch: {}: {}", path.display(), e))?;
437
438    let is_deprecated = fm.is_deprecated.unwrap_or(false);
439    let is_unknown = fm.is_unknown.unwrap_or(false);
440    let is_generic = fm.is_generic.unwrap_or(false);
441
442    LoadedLicense::validate_text_content(
443        &parsed.text_content,
444        is_deprecated,
445        is_unknown,
446        is_generic,
447    )
448    .map_err(|e| {
449        anyhow!(
450            "License file has invalid content: {}: {}",
451            path.display(),
452            e
453        )
454    })?;
455
456    let name = LoadedLicense::derive_name(fm.name.as_deref(), fm.short_name.as_deref(), &key);
457
458    let reference_urls = LoadedLicense::merge_reference_urls(
459        fm.text_urls.as_deref(),
460        fm.other_urls.as_deref(),
461        fm.osi_url.as_deref(),
462        fm.faq_url.as_deref(),
463        fm.homepage_url.as_deref(),
464    );
465
466    let minimum_coverage = fm.minimum_coverage.and_then(|n| n.as_u8());
467
468    Ok(LoadedLicense {
469        key,
470        name,
471        spdx_license_key: LoadedLicense::normalize_optional_string(fm.spdx_license_key.as_deref()),
472        other_spdx_license_keys: fm.other_spdx_license_keys.unwrap_or_default(),
473        category: LoadedLicense::normalize_optional_string(fm.category.as_deref()),
474        text: parsed.text_content,
475        reference_urls,
476        notes: LoadedLicense::normalize_optional_string(fm.notes.as_deref()),
477        is_deprecated,
478        replaced_by: fm.replaced_by.unwrap_or_default(),
479        minimum_coverage,
480        ignorable_copyrights: LoadedLicense::normalize_optional_list(
481            fm.ignorable_copyrights.as_deref(),
482        ),
483        ignorable_holders: LoadedLicense::normalize_optional_list(fm.ignorable_holders.as_deref()),
484        ignorable_authors: LoadedLicense::normalize_optional_list(fm.ignorable_authors.as_deref()),
485        ignorable_urls: LoadedLicense::normalize_optional_list(fm.ignorable_urls.as_deref()),
486        ignorable_emails: LoadedLicense::normalize_optional_list(fm.ignorable_emails.as_deref()),
487    })
488}
489
490/// Parse license file content into frontmatter and text sections.
491///
492/// The `path` parameter is used for error messages only.
493fn parse_license_file_content(content: &str, path: &Path) -> Result<ParsedLicenseFile> {
494    if content.len() < 6 {
495        return Err(anyhow!(
496            "License file content too short: {}",
497            path.display()
498        ));
499    }
500
501    let parts: Vec<&str> = FM_BOUNDARY.splitn(content, 3).collect();
502
503    if parts.len() < 3 {
504        let trimmed = content.trim();
505        if trimmed.is_empty() {
506            return Err(anyhow!(
507                "License file is empty or has no content: {}",
508                path.display()
509            ));
510        }
511        return Err(anyhow!(
512            "License file missing delimiter '---': {}",
513            path.display()
514        ));
515    }
516
517    let yaml_content = parts
518        .get(1)
519        .ok_or_else(|| anyhow!("Missing YAML frontmatter in {}", path.display()))?
520        .to_string();
521    let text_content = parts
522        .get(2)
523        .ok_or_else(|| {
524            anyhow!(
525                "Missing text content after frontmatter in {}",
526                path.display()
527            )
528        })?
529        .trim_start_matches('\n')
530        .trim()
531        .to_string();
532
533    Ok(ParsedLicenseFile {
534        yaml_content,
535        text_content,
536    })
537}
538
539/// Load all .RULE files from a directory into `LoadedRule` values (loader-stage).
540///
541/// This function loads ALL rules, including deprecated ones.
542/// Deprecated filtering is a build-stage concern.
543///
544/// # Arguments
545/// * `dir` - Directory containing .RULE files
546///
547/// # Returns
548/// * `Ok(Vec<LoadedRule>)` - All loaded rules (including deprecated)
549/// * `Err(...)` - Directory read error
550pub fn load_loaded_rules_from_directory(dir: &Path) -> Result<Vec<LoadedRule>> {
551    let mut rules = Vec::new();
552
553    let entries = fs::read_dir(dir)
554        .with_context(|| format!("Failed to read rules directory: {}", dir.display()))?;
555
556    for entry in entries {
557        let entry = entry
558            .with_context(|| format!("Failed to read directory entry in: {}", dir.display()))?;
559        let path = entry.path();
560
561        if path.is_file() && path.extension().and_then(|s| s.to_str()) == Some("RULE") {
562            match parse_rule_to_loaded(&path) {
563                Ok(rule) => rules.push(rule),
564                Err(e) => {
565                    eprintln!(
566                        "Warning: Failed to parse rule file {}: {}",
567                        path.display(),
568                        e
569                    );
570                }
571            }
572        }
573    }
574
575    Ok(rules)
576}
577
578/// Load all .LICENSE files from a directory into `LoadedLicense` values (loader-stage).
579///
580/// This function loads ALL licenses, including deprecated ones.
581/// Deprecated filtering is a build-stage concern.
582///
583/// # Arguments
584/// * `dir` - Directory containing .LICENSE files
585///
586/// # Returns
587/// * `Ok(Vec<LoadedLicense>)` - All loaded licenses (including deprecated)
588/// * `Err(...)` - Directory read error
589pub fn load_loaded_licenses_from_directory(dir: &Path) -> Result<Vec<LoadedLicense>> {
590    let mut licenses = Vec::new();
591
592    let entries = fs::read_dir(dir)
593        .with_context(|| format!("Failed to read licenses directory: {}", dir.display()))?;
594
595    for entry in entries {
596        let entry = entry
597            .with_context(|| format!("Failed to read directory entry in: {}", dir.display()))?;
598        let path = entry.path();
599
600        if path.is_file() && path.extension().and_then(|s| s.to_str()) == Some("LICENSE") {
601            match parse_license_to_loaded(&path) {
602                Ok(license) => licenses.push(license),
603                Err(e) => {
604                    eprintln!(
605                        "Warning: Failed to parse license file {}: {}",
606                        path.display(),
607                        e
608                    );
609                }
610            }
611        }
612    }
613
614    Ok(licenses)
615}
616
617/// Validate loaded rules for common issues.
618///
619/// Checks for:
620/// 1. Duplicate rule texts (warns if found)
621/// 2. Empty license expressions for non-false-positive rules (warns if found)
622///
623/// Corresponds to Python:
624/// - `models.py:validate()` for license expression validation
625/// - `index.py:_add_rules()` for duplicate detection via hash
626///
627/// Kept for backward compatibility with `load_rules_from_directory`.
628#[allow(dead_code)]
629fn validate_rules(rules: &[Rule]) {
630    let mut seen_texts: HashSet<&str> = HashSet::new();
631    let mut duplicate_count = 0;
632
633    for rule in rules {
634        if !seen_texts.insert(&rule.text) {
635            warn!(
636                "Duplicate rule text found for license_expression: {}",
637                rule.license_expression
638            );
639            duplicate_count += 1;
640        }
641
642        if !rule.is_false_positive && rule.license_expression.trim().is_empty() {
643            warn!("Rule has empty license_expression but is not marked as false_positive");
644        }
645    }
646
647    if duplicate_count > 0 {
648        warn!(
649            "Found {} duplicate rule text(s) during rule validation",
650            duplicate_count
651        );
652    }
653}
654
655/// Load all .RULE files from a directory into `Rule` values (backward-compatible).
656///
657/// This function loads rules and applies deprecated filtering during loading.
658/// For the two-stage pipeline, prefer `load_loaded_rules_from_directory` and
659/// `build_index_from_loaded`.
660///
661/// Kept for backward compatibility and testing despite not being used in production code.
662/// The new pipeline uses the two-stage loading process instead.
663#[allow(dead_code)]
664pub fn load_rules_from_directory(dir: &Path, with_deprecated: bool) -> Result<Vec<Rule>> {
665    let loaded = load_loaded_rules_from_directory(dir)?;
666    let rules: Vec<Rule> = loaded
667        .into_iter()
668        .filter(|r| with_deprecated || !r.is_deprecated)
669        .map(loaded_rule_to_rule)
670        .collect();
671    validate_rules(&rules);
672    Ok(rules)
673}
674
675/// Load all .LICENSE files from a directory into `License` values (backward-compatible).
676///
677/// This function loads licenses and applies deprecated filtering during loading.
678/// For the two-stage pipeline, prefer `load_loaded_licenses_from_directory` and
679/// `build_index_from_loaded`.
680///
681/// Kept for backward compatibility and testing despite not being used in production code.
682/// The new pipeline uses the two-stage loading process instead.
683#[allow(dead_code)]
684pub fn load_licenses_from_directory(dir: &Path, with_deprecated: bool) -> Result<Vec<License>> {
685    let loaded = load_loaded_licenses_from_directory(dir)?;
686    let licenses: Vec<License> = loaded
687        .into_iter()
688        .filter(|l| with_deprecated || !l.is_deprecated)
689        .map(loaded_license_to_license)
690        .collect();
691    Ok(licenses)
692}
693
694#[cfg(test)]
695mod tests {
696    use super::*;
697    use std::collections::HashMap;
698    use std::fs;
699    use tempfile::tempdir;
700
701    pub fn parse_rule_file(path: &Path) -> Result<Rule> {
702        let loaded = parse_rule_to_loaded(path)?;
703        Ok(loaded_rule_to_rule(loaded))
704    }
705
706    #[test]
707    fn test_parse_number_as_u8() {
708        let num_int: serde_yaml::Number = serde_yaml::from_str("100").unwrap();
709        assert_eq!(num_int.as_u8(), Some(100));
710
711        let num_out_of_range: serde_yaml::Number = serde_yaml::from_str("500").unwrap();
712        assert_eq!(num_out_of_range.as_u8(), None);
713
714        let num_float: serde_yaml::Number = serde_yaml::from_str("90.5").unwrap();
715        assert_eq!(num_float.as_u8(), Some(90));
716    }
717
718    #[test]
719    fn test_parse_simple_license_file() {
720        let dir = tempdir().unwrap();
721        let license_path = dir.path().join("mit.LICENSE");
722        fs::write(
723            &license_path,
724            r#"---
725key: mit
726short_name: MIT License
727name: MIT License
728category: Permissive
729spdx_license_key: MIT
730---
731MIT License text here"#,
732        )
733        .unwrap();
734
735        let license = parse_license_to_loaded(&license_path)
736            .map(loaded_license_to_license)
737            .unwrap();
738        assert_eq!(license.key, "mit");
739        assert_eq!(license.name, "MIT License");
740        assert!(license.text.contains("MIT License text"));
741    }
742
743    #[test]
744    fn test_parse_simple_rule_file() {
745        let dir = tempdir().unwrap();
746        let rule_path = dir.path().join("mit_1.RULE");
747        fs::write(
748            &rule_path,
749            r#"---
750license_expression: mit
751is_license_reference: yes
752relevance: 90
753referenced_filenames:
754    - MIT.txt
755---
756MIT.txt"#,
757        )
758        .unwrap();
759
760        let rule = parse_rule_file(&rule_path).unwrap();
761        assert_eq!(rule.license_expression, "mit");
762        assert_eq!(rule.text, "MIT.txt");
763        assert!(rule.is_license_reference());
764        assert_eq!(rule.relevance, 90);
765    }
766
767    #[test]
768    fn test_deserialize_yes_no_bool() {
769        let dir = tempdir().unwrap();
770        let rule_path = dir.path().join("test.RULE");
771
772        fs::write(
773            &rule_path,
774            r#"---
775license_expression: mit
776is_license_notice: yes
777is_license_tag: no
778---
779MIT License"#,
780        )
781        .unwrap();
782
783        let rule = parse_rule_file(&rule_path).unwrap();
784        assert!(rule.is_license_notice());
785        assert!(!rule.is_license_tag());
786    }
787
788    #[test]
789    fn test_load_licenses_from_directory() {
790        let dir = tempdir().unwrap();
791
792        fs::write(
793            dir.path().join("test.LICENSE"),
794            r#"---
795key: test
796name: Test License
797spdx_license_key: TEST
798category: Permissive
799---
800Test license text here"#,
801        )
802        .unwrap();
803
804        let licenses = load_licenses_from_directory(dir.path(), false).unwrap();
805        assert_eq!(licenses.len(), 1);
806
807        let license = &licenses[0];
808        assert_eq!(license.key, "test");
809        assert_eq!(license.name, "Test License");
810        assert_eq!(license.spdx_license_key, Some("TEST".to_string()));
811        assert!(!license.text.is_empty());
812    }
813
814    #[test]
815    fn test_load_rules_from_directory() {
816        let dir = tempdir().unwrap();
817
818        fs::write(
819            dir.path().join("test_1.RULE"),
820            r#"---
821license_expression: test
822is_license_reference: yes
823relevance: 85
824referenced_filenames:
825    - TEST.txt
826---
827TEST.txt"#,
828        )
829        .unwrap();
830
831        let rules = load_rules_from_directory(dir.path(), false).unwrap();
832        assert_eq!(rules.len(), 1);
833
834        let rule = &rules[0];
835        assert_eq!(rule.license_expression, "test");
836        assert!(rule.is_license_reference());
837        assert_eq!(rule.relevance, 85);
838    }
839
840    #[test]
841    fn test_validate_rules_detects_duplicates() {
842        let rules = vec![
843            Rule {
844                identifier: "mit.LICENSE".to_string(),
845                license_expression: "mit".to_string(),
846                text: "MIT License".to_string(),
847                tokens: vec![],
848                rule_kind: crate::license_detection::models::RuleKind::Text,
849                is_false_positive: false,
850                is_required_phrase: false,
851                is_from_license: false,
852                relevance: 100,
853                minimum_coverage: None,
854                has_stored_minimum_coverage: false,
855                is_continuous: false,
856                required_phrase_spans: vec![],
857                stopwords_by_pos: HashMap::new(),
858                referenced_filenames: None,
859                ignorable_urls: None,
860                ignorable_emails: None,
861                ignorable_copyrights: None,
862                ignorable_holders: None,
863                ignorable_authors: None,
864                language: None,
865                notes: None,
866                length_unique: 0,
867                high_length_unique: 0,
868                high_length: 0,
869                min_matched_length: 0,
870                min_high_matched_length: 0,
871                min_matched_length_unique: 0,
872                min_high_matched_length_unique: 0,
873                is_small: false,
874                is_tiny: false,
875                starts_with_license: false,
876                ends_with_license: false,
877                is_deprecated: false,
878                spdx_license_key: None,
879                other_spdx_license_keys: vec![],
880            },
881            Rule {
882                identifier: "apache-2.0.LICENSE".to_string(),
883                license_expression: "apache-2.0".to_string(),
884                text: "MIT License".to_string(),
885                tokens: vec![],
886                rule_kind: crate::license_detection::models::RuleKind::Text,
887                is_false_positive: false,
888                is_required_phrase: false,
889                is_from_license: false,
890                relevance: 100,
891                minimum_coverage: None,
892                has_stored_minimum_coverage: false,
893                is_continuous: false,
894                required_phrase_spans: vec![],
895                stopwords_by_pos: HashMap::new(),
896                referenced_filenames: None,
897                ignorable_urls: None,
898                ignorable_emails: None,
899                ignorable_copyrights: None,
900                ignorable_holders: None,
901                ignorable_authors: None,
902                language: None,
903                notes: None,
904                length_unique: 0,
905                high_length_unique: 0,
906                high_length: 0,
907                min_matched_length: 0,
908                min_high_matched_length: 0,
909                min_matched_length_unique: 0,
910                min_high_matched_length_unique: 0,
911                is_small: false,
912                is_tiny: false,
913                starts_with_license: false,
914                ends_with_license: false,
915                is_deprecated: false,
916                spdx_license_key: None,
917                other_spdx_license_keys: vec![],
918            },
919        ];
920
921        validate_rules(&rules);
922    }
923
924    #[test]
925    fn test_validate_rules_accepts_false_positive_without_expression() {
926        let rules = vec![Rule {
927            identifier: "fp.RULE".to_string(),
928            license_expression: "".to_string(),
929            text: "Some text".to_string(),
930            tokens: vec![],
931            rule_kind: crate::license_detection::models::RuleKind::None,
932            is_false_positive: true,
933            is_required_phrase: false,
934            is_from_license: false,
935            relevance: 100,
936            minimum_coverage: None,
937            has_stored_minimum_coverage: false,
938            is_continuous: false,
939            required_phrase_spans: vec![],
940            stopwords_by_pos: HashMap::new(),
941            referenced_filenames: None,
942            ignorable_urls: None,
943            ignorable_emails: None,
944            ignorable_copyrights: None,
945            ignorable_holders: None,
946            ignorable_authors: None,
947            language: None,
948            notes: Some("False positive for common pattern".to_string()),
949            length_unique: 0,
950            high_length_unique: 0,
951            high_length: 0,
952            min_matched_length: 0,
953            min_high_matched_length: 0,
954            min_matched_length_unique: 0,
955            min_high_matched_length_unique: 0,
956            is_small: false,
957            is_tiny: false,
958            starts_with_license: false,
959            ends_with_license: false,
960            is_deprecated: false,
961            spdx_license_key: None,
962            other_spdx_license_keys: vec![],
963        }];
964
965        validate_rules(&rules);
966    }
967
968    #[test]
969    fn test_validate_rules_no_duplicates() {
970        let rules = vec![
971            Rule {
972                identifier: "mit.LICENSE".to_string(),
973                license_expression: "mit".to_string(),
974                text: "MIT License".to_string(),
975                tokens: vec![],
976                rule_kind: crate::license_detection::models::RuleKind::Text,
977                is_false_positive: false,
978                is_required_phrase: false,
979                is_from_license: false,
980                relevance: 100,
981                minimum_coverage: None,
982                has_stored_minimum_coverage: false,
983                is_continuous: false,
984                required_phrase_spans: vec![],
985                stopwords_by_pos: HashMap::new(),
986                referenced_filenames: None,
987                ignorable_urls: None,
988                ignorable_emails: None,
989                ignorable_copyrights: None,
990                ignorable_holders: None,
991                ignorable_authors: None,
992                language: None,
993                notes: None,
994                length_unique: 0,
995                high_length_unique: 0,
996                high_length: 0,
997                min_matched_length: 0,
998                min_high_matched_length: 0,
999                min_matched_length_unique: 0,
1000                min_high_matched_length_unique: 0,
1001                is_small: false,
1002                is_tiny: false,
1003                starts_with_license: false,
1004                ends_with_license: false,
1005                is_deprecated: false,
1006                spdx_license_key: None,
1007                other_spdx_license_keys: vec![],
1008            },
1009            Rule {
1010                identifier: "apache-2.0.LICENSE".to_string(),
1011                license_expression: "apache-2.0".to_string(),
1012                text: "Apache License".to_string(),
1013                tokens: vec![],
1014                rule_kind: crate::license_detection::models::RuleKind::Text,
1015                is_false_positive: false,
1016                is_required_phrase: false,
1017                is_from_license: false,
1018                relevance: 100,
1019                minimum_coverage: None,
1020                has_stored_minimum_coverage: false,
1021                is_continuous: false,
1022                required_phrase_spans: vec![],
1023                stopwords_by_pos: HashMap::new(),
1024                referenced_filenames: None,
1025                ignorable_urls: None,
1026                ignorable_emails: None,
1027                ignorable_copyrights: None,
1028                ignorable_holders: None,
1029                ignorable_authors: None,
1030                language: None,
1031                notes: None,
1032                length_unique: 0,
1033                high_length_unique: 0,
1034                high_length: 0,
1035                min_matched_length: 0,
1036                min_high_matched_length: 0,
1037                min_matched_length_unique: 0,
1038                min_high_matched_length_unique: 0,
1039                is_small: false,
1040                is_tiny: false,
1041                starts_with_license: false,
1042                ends_with_license: false,
1043                is_deprecated: false,
1044                spdx_license_key: None,
1045                other_spdx_license_keys: vec![],
1046            },
1047        ];
1048
1049        validate_rules(&rules);
1050    }
1051
1052    #[test]
1053    fn test_load_licenses_filters_deprecated_by_default() {
1054        let dir = tempdir().unwrap();
1055
1056        fs::write(
1057            dir.path().join("active.LICENSE"),
1058            r#"---
1059key: active
1060name: Active License
1061---
1062Active license text"#,
1063        )
1064        .unwrap();
1065
1066        fs::write(
1067            dir.path().join("deprecated.LICENSE"),
1068            r#"---
1069key: deprecated
1070name: Deprecated License
1071is_deprecated: yes
1072---
1073Deprecated license text"#,
1074        )
1075        .unwrap();
1076
1077        let licenses_without = load_licenses_from_directory(dir.path(), false).unwrap();
1078        assert_eq!(licenses_without.len(), 1);
1079        assert_eq!(licenses_without[0].key, "active");
1080
1081        let licenses_with = load_licenses_from_directory(dir.path(), true).unwrap();
1082        assert_eq!(licenses_with.len(), 2);
1083    }
1084
1085    #[test]
1086    fn test_load_rules_filters_deprecated_by_default() {
1087        let dir = tempdir().unwrap();
1088
1089        fs::write(
1090            dir.path().join("active.RULE"),
1091            r#"---
1092license_expression: active
1093is_license_notice: yes
1094---
1095Active rule text"#,
1096        )
1097        .unwrap();
1098
1099        fs::write(
1100            dir.path().join("deprecated.RULE"),
1101            r#"---
1102license_expression: deprecated
1103is_license_notice: yes
1104is_deprecated: yes
1105---
1106Deprecated rule text"#,
1107        )
1108        .unwrap();
1109
1110        let rules_without = load_rules_from_directory(dir.path(), false).unwrap();
1111        assert_eq!(rules_without.len(), 1);
1112        assert_eq!(rules_without[0].license_expression, "active");
1113
1114        let rules_with = load_rules_from_directory(dir.path(), true).unwrap();
1115        assert_eq!(rules_with.len(), 2);
1116    }
1117
1118    #[test]
1119    fn test_parse_rule_to_loaded() {
1120        let dir = tempdir().unwrap();
1121        let rule_path = dir.path().join("mit_1.RULE");
1122        fs::write(
1123            &rule_path,
1124            r#"---
1125license_expression: mit
1126is_license_reference: yes
1127relevance: 90
1128referenced_filenames:
1129    - MIT.txt
1130---
1131MIT.txt"#,
1132        )
1133        .unwrap();
1134
1135        let loaded = parse_rule_to_loaded(&rule_path).unwrap();
1136        assert_eq!(loaded.identifier, "mit_1.RULE");
1137        assert_eq!(loaded.license_expression, "mit");
1138        assert_eq!(loaded.text, "MIT.txt");
1139        assert_eq!(
1140            loaded.rule_kind,
1141            crate::license_detection::models::RuleKind::Reference
1142        );
1143        assert_eq!(loaded.relevance, Some(90));
1144        assert_eq!(
1145            loaded.referenced_filenames,
1146            Some(vec!["MIT.txt".to_string()])
1147        );
1148        assert!(!loaded.is_deprecated);
1149    }
1150
1151    #[test]
1152    fn test_parse_license_to_loaded() {
1153        let dir = tempdir().unwrap();
1154        let license_path = dir.path().join("mit.LICENSE");
1155        fs::write(
1156            &license_path,
1157            r#"---
1158key: mit
1159short_name: MIT License
1160name: MIT License
1161category: Permissive
1162spdx_license_key: MIT
1163---
1164MIT License text here"#,
1165        )
1166        .unwrap();
1167
1168        let loaded = parse_license_to_loaded(&license_path).unwrap();
1169        assert_eq!(loaded.key, "mit");
1170        assert_eq!(loaded.name, "MIT License");
1171        assert!(loaded.text.contains("MIT License text"));
1172        assert_eq!(loaded.spdx_license_key, Some("MIT".to_string()));
1173    }
1174
1175    #[test]
1176    fn test_load_loaded_rules_from_directory_includes_deprecated() {
1177        let dir = tempdir().unwrap();
1178
1179        fs::write(
1180            dir.path().join("active.RULE"),
1181            r#"---
1182license_expression: active
1183is_license_notice: yes
1184---
1185Active rule text"#,
1186        )
1187        .unwrap();
1188
1189        fs::write(
1190            dir.path().join("deprecated.RULE"),
1191            r#"---
1192license_expression: deprecated
1193is_license_notice: yes
1194is_deprecated: yes
1195---
1196Deprecated rule text"#,
1197        )
1198        .unwrap();
1199
1200        let loaded_rules = load_loaded_rules_from_directory(dir.path()).unwrap();
1201        assert_eq!(loaded_rules.len(), 2);
1202
1203        let active = loaded_rules
1204            .iter()
1205            .find(|r| r.license_expression == "active")
1206            .unwrap();
1207        assert!(!active.is_deprecated);
1208
1209        let deprecated = loaded_rules
1210            .iter()
1211            .find(|r| r.license_expression == "deprecated")
1212            .unwrap();
1213        assert!(deprecated.is_deprecated);
1214    }
1215
1216    #[test]
1217    fn test_load_loaded_licenses_from_directory_includes_deprecated() {
1218        let dir = tempdir().unwrap();
1219
1220        fs::write(
1221            dir.path().join("active.LICENSE"),
1222            r#"---
1223key: active
1224name: Active License
1225---
1226Active license text"#,
1227        )
1228        .unwrap();
1229
1230        fs::write(
1231            dir.path().join("deprecated.LICENSE"),
1232            r#"---
1233key: deprecated
1234name: Deprecated License
1235is_deprecated: yes
1236---
1237Deprecated license text"#,
1238        )
1239        .unwrap();
1240
1241        let loaded_licenses = load_loaded_licenses_from_directory(dir.path()).unwrap();
1242        assert_eq!(loaded_licenses.len(), 2);
1243
1244        let active = loaded_licenses.iter().find(|l| l.key == "active").unwrap();
1245        assert!(!active.is_deprecated);
1246
1247        let deprecated = loaded_licenses
1248            .iter()
1249            .find(|l| l.key == "deprecated")
1250            .unwrap();
1251        assert!(deprecated.is_deprecated);
1252    }
1253}