Skip to main content

provenant/license_detection/rules/
loader.rs

1// SPDX-FileCopyrightText: Provenant contributors
2// SPDX-License-Identifier: Apache-2.0
3
4//! Parse .LICENSE and .RULE files.
5//!
6//! This module provides two-stage loading:
7//! 1. Loader-stage: Parse files into `LoadedRule` and `LoadedLicense`
8//! 2. Build-stage: Convert to runtime `Rule` and `License` (deprecated filtering, etc.)
9//!
10//! The loader-stage functions (`parse_rule_to_loaded`, `parse_license_to_loaded`,
11//! `load_loaded_rules_from_directory`, `load_loaded_licenses_from_directory`) return
12//! all entries including deprecated ones. Deprecated filtering is a build-stage concern.
13
14use crate::license_detection::index::{loaded_license_to_license, loaded_rule_to_rule};
15use crate::license_detection::models::{License, LoadedLicense, LoadedRule, Rule};
16use anyhow::{Context, Result, anyhow};
17use log::warn;
18use regex::Regex;
19use serde::{Deserialize, Deserializer, Serialize};
20use std::collections::HashSet;
21use std::fs;
22use std::path::Path;
23use std::sync::LazyLock;
24
/// Frontmatter boundary: a line that starts with three or more hyphens,
/// optionally followed by whitespace (multi-line mode, so `^`/`$` anchor at
/// line breaks). The regex is compiled lazily on first use and then reused.
static FM_BOUNDARY: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"(?m)^-{3,}\s*$").expect("Invalid frontmatter regex"));
27
28fn deserialize_yes_no_bool<'de, D>(deserializer: D) -> Result<Option<bool>, D::Error>
29where
30    D: Deserializer<'de>,
31{
32    #[derive(Deserialize, Serialize)]
33    #[serde(untagged)]
34    enum YesNoOrBool {
35        String(String),
36        Bool(bool),
37    }
38
39    match YesNoOrBool::deserialize(deserializer)? {
40        YesNoOrBool::Bool(b) => Ok(Some(b)),
41        YesNoOrBool::String(s) => {
42            let lower = s.to_lowercase();
43            if lower == "yes" || lower == "true" || lower == "1" {
44                Ok(Some(true))
45            } else if lower == "no" || lower == "false" || lower == "0" {
46                Ok(Some(false))
47            } else {
48                Ok(None)
49            }
50        }
51    }
52}
53
54trait ParseNumber {
55    fn as_u8(&self) -> Option<u8>;
56}
57
58impl ParseNumber for yaml_serde::Number {
59    fn as_u8(&self) -> Option<u8> {
60        self.as_i64()
61            .and_then(|n| u8::try_from(n).ok())
62            .or_else(|| {
63                self.as_f64().and_then(|f| {
64                    if f >= 0.0 && f <= f64::from(u8::MAX) {
65                        // truncation toward zero is intentional (e.g. 90.5 → 90)
66                        #[allow(clippy::cast_sign_loss)]
67                        Some(f as u8)
68                    } else {
69                        None
70                    }
71                })
72            })
73    }
74}
75
/// Raw YAML frontmatter of a `.LICENSE` file, exactly as deserialized.
///
/// Every field is optional: a key that is absent from the YAML deserializes
/// to `None`. Normalization and validation happen afterwards in
/// `parse_license_source_to_loaded`.
#[derive(Debug, Deserialize)]
#[allow(dead_code)]
struct LicenseFrontmatter {
    /// License key as stored in the file; checked against the key derived
    /// from the file name.
    #[serde(default)]
    key: Option<String>,

    #[serde(default)]
    short_name: Option<String>,

    #[serde(default)]
    name: Option<String>,

    #[serde(default)]
    category: Option<String>,

    #[serde(default)]
    owner: Option<String>,

    #[serde(default)]
    homepage_url: Option<String>,

    #[serde(default)]
    notes: Option<String>,

    #[serde(default)]
    spdx_license_key: Option<String>,

    #[serde(default)]
    other_spdx_license_keys: Option<Vec<String>>,

    #[serde(default)]
    osi_license_key: Option<String>,

    #[serde(default)]
    text_urls: Option<Vec<String>>,

    #[serde(default)]
    osi_url: Option<String>,

    #[serde(default)]
    faq_url: Option<String>,

    #[serde(default)]
    other_urls: Option<Vec<String>>,

    // The `is_*` flags accept native booleans or yes/no-style strings
    // (see `deserialize_yes_no_bool`).
    #[serde(default, deserialize_with = "deserialize_yes_no_bool")]
    is_deprecated: Option<bool>,

    #[serde(default)]
    replaced_by: Option<Vec<String>>,

    #[serde(default, deserialize_with = "deserialize_yes_no_bool")]
    is_exception: Option<bool>,

    #[serde(default, deserialize_with = "deserialize_yes_no_bool")]
    is_unknown: Option<bool>,

    #[serde(default, deserialize_with = "deserialize_yes_no_bool")]
    is_generic: Option<bool>,

    /// Kept as a generic YAML number; narrowed to `u8` via `ParseNumber::as_u8`.
    #[serde(default)]
    minimum_coverage: Option<yaml_serde::Number>,

    #[serde(default)]
    standard_notice: Option<String>,

    #[serde(default)]
    ignorable_copyrights: Option<Vec<String>>,

    #[serde(default)]
    ignorable_holders: Option<Vec<String>>,

    #[serde(default)]
    ignorable_authors: Option<Vec<String>>,

    #[serde(default)]
    ignorable_urls: Option<Vec<String>>,

    #[serde(default)]
    ignorable_emails: Option<Vec<String>>,
}
157
/// Parsed rule file content, split into frontmatter and text.
struct ParsedRuleFile {
    /// Raw YAML text found between the first and second frontmatter boundary.
    yaml_content: String,
    /// Text following the second boundary, with surrounding whitespace trimmed.
    text_content: String,
    /// `true` when the frontmatter is a YAML mapping that contains a
    /// `minimum_coverage` key.
    has_stored_minimum_coverage: bool,
}
164
/// Parsed license file content, split into frontmatter and text.
struct ParsedLicenseFile {
    /// Raw YAML text found between the first and second frontmatter boundary.
    yaml_content: String,
    /// Text following the second boundary, with surrounding whitespace trimmed.
    text_content: String,
}
170
171/// Parse file content into frontmatter and text sections.
172///
173/// Returns `ParsedRuleFile` with yaml_content, text_content, and metadata.
174/// The `path` parameter is used for error messages only.
175fn parse_file_content(content: &str, path: &Path) -> Result<ParsedRuleFile> {
176    if content.len() < 6 {
177        return Err(anyhow!("File content too short: {}", path.display()));
178    }
179
180    let parts: Vec<&str> = FM_BOUNDARY.splitn(content, 3).collect();
181
182    if parts.len() < 3 {
183        let trimmed = content.trim();
184        if trimmed.is_empty() {
185            return Err(anyhow!(
186                "File is empty or has no content: {}",
187                path.display()
188            ));
189        }
190        return Err(anyhow!("File missing delimiter '---': {}", path.display()));
191    }
192
193    let yaml_content = parts
194        .get(1)
195        .ok_or_else(|| anyhow!("Missing YAML frontmatter in {}", path.display()))?
196        .to_string();
197    let text_content = parts
198        .get(2)
199        .ok_or_else(|| {
200            anyhow!(
201                "Missing text content after frontmatter in {}",
202                path.display()
203            )
204        })?
205        .trim_start_matches('\n')
206        .trim()
207        .to_string();
208
209    let frontmatter_value: yaml_serde::Value =
210        yaml_serde::from_str(&yaml_content).map_err(|e| {
211            anyhow!(
212                "Failed to parse frontmatter YAML in {}: {}\nContent was:\n{}",
213                path.display(),
214                e,
215                yaml_content
216            )
217        })?;
218
219    let has_stored_minimum_coverage = frontmatter_value.as_mapping().is_some_and(|mapping| {
220        mapping.contains_key(yaml_serde::Value::String("minimum_coverage".to_string()))
221    });
222
223    Ok(ParsedRuleFile {
224        yaml_content,
225        text_content,
226        has_stored_minimum_coverage,
227    })
228}
229
/// Raw YAML frontmatter of a `.RULE` file, exactly as deserialized.
///
/// Every field is optional: a key that is absent from the YAML deserializes
/// to `None`. Normalization and validation happen afterwards in
/// `parse_rule_source_to_loaded`.
#[derive(Debug, Deserialize)]
#[allow(dead_code)]
struct RuleFrontmatter {
    #[serde(default)]
    license_expression: Option<String>,

    // The six rule-kind flags below are combined into a single rule kind by
    // `LoadedRule::derive_rule_kind`. Like every `is_*` flag here they accept
    // native booleans or yes/no-style strings (see `deserialize_yes_no_bool`).
    #[serde(default, deserialize_with = "deserialize_yes_no_bool")]
    is_license_text: Option<bool>,

    #[serde(default, deserialize_with = "deserialize_yes_no_bool")]
    is_license_notice: Option<bool>,

    #[serde(default, deserialize_with = "deserialize_yes_no_bool")]
    is_license_reference: Option<bool>,

    #[serde(default, deserialize_with = "deserialize_yes_no_bool")]
    is_license_tag: Option<bool>,

    #[serde(default, deserialize_with = "deserialize_yes_no_bool")]
    is_license_intro: Option<bool>,

    #[serde(default, deserialize_with = "deserialize_yes_no_bool")]
    is_license_clue: Option<bool>,

    #[serde(default, deserialize_with = "deserialize_yes_no_bool")]
    is_false_positive: Option<bool>,

    #[serde(default, deserialize_with = "deserialize_yes_no_bool")]
    is_required_phrase: Option<bool>,

    #[serde(default, deserialize_with = "deserialize_yes_no_bool")]
    skip_for_required_phrase_generation: Option<bool>,

    /// Kept as a generic YAML number; narrowed to `u8` via `ParseNumber::as_u8`.
    #[serde(default)]
    relevance: Option<yaml_serde::Number>,

    /// Kept as a generic YAML number; narrowed to `u8` via `ParseNumber::as_u8`.
    #[serde(default)]
    minimum_coverage: Option<yaml_serde::Number>,

    #[serde(default, deserialize_with = "deserialize_yes_no_bool")]
    is_continuous: Option<bool>,

    #[serde(default, deserialize_with = "deserialize_yes_no_bool")]
    is_deprecated: Option<bool>,

    #[serde(default)]
    referenced_filenames: Option<Vec<String>>,

    #[serde(default)]
    replaced_by: Option<Vec<String>>,

    #[serde(default)]
    ignorable_urls: Option<Vec<String>>,

    #[serde(default)]
    ignorable_emails: Option<Vec<String>>,

    #[serde(default)]
    notes: Option<String>,

    #[serde(default)]
    ignorable_copyrights: Option<Vec<String>>,

    #[serde(default)]
    ignorable_holders: Option<Vec<String>>,

    #[serde(default)]
    ignorable_authors: Option<Vec<String>>,

    #[serde(default)]
    language: Option<String>,
}
302
303fn parse_rule_source_to_loaded(
304    identifier: &str,
305    content: &str,
306    source_path: &Path,
307) -> Result<LoadedRule> {
308    let identifier = LoadedRule::derive_identifier(
309        source_path
310            .file_name()
311            .and_then(|s| s.to_str())
312            .unwrap_or(identifier),
313    );
314
315    let parsed = parse_file_content(content, source_path)?;
316
317    if parsed.text_content.is_empty() {
318        return Err(anyhow!(
319            "Rule file has empty text content: {}",
320            source_path.display()
321        ));
322    }
323
324    let fm: RuleFrontmatter = yaml_serde::from_str(&parsed.yaml_content).map_err(|e| {
325        anyhow!(
326            "Failed to parse rule frontmatter YAML in {}: {}\nContent was:\n{}",
327            source_path.display(),
328            e,
329            parsed.yaml_content
330        )
331    })?;
332
333    let is_false_positive = fm.is_false_positive.unwrap_or(false);
334
335    let rule_kind = LoadedRule::derive_rule_kind(
336        fm.is_license_text.unwrap_or(false),
337        fm.is_license_notice.unwrap_or(false),
338        fm.is_license_reference.unwrap_or(false),
339        fm.is_license_tag.unwrap_or(false),
340        fm.is_license_intro.unwrap_or(false),
341        fm.is_license_clue.unwrap_or(false),
342    )
343    .map_err(|e| {
344        anyhow!(
345            "Rule file has invalid rule-kind flags: {}: {}",
346            source_path.display(),
347            e
348        )
349    })?;
350
351    LoadedRule::validate_rule_kind_flags(rule_kind, is_false_positive).map_err(|e| {
352        anyhow!(
353            "Rule file has invalid flags: {}: {}",
354            source_path.display(),
355            e
356        )
357    })?;
358
359    let license_expression = LoadedRule::normalize_license_expression(
360        fm.license_expression.as_deref(),
361        is_false_positive,
362    )
363    .map_err(|e| {
364        anyhow!(
365            "Rule file has invalid license_expression: {}: {}",
366            source_path.display(),
367            e
368        )
369    })?;
370
371    let relevance = fm.relevance.and_then(|n| n.as_u8());
372
373    let minimum_coverage = fm.minimum_coverage.and_then(|n| n.as_u8());
374
375    Ok(LoadedRule {
376        identifier,
377        license_expression,
378        text: parsed.text_content,
379        rule_kind,
380        is_false_positive,
381        is_required_phrase: fm.is_required_phrase.unwrap_or(false),
382        skip_for_required_phrase_generation: fm
383            .skip_for_required_phrase_generation
384            .unwrap_or(false),
385        relevance,
386        minimum_coverage,
387        has_stored_minimum_coverage: parsed.has_stored_minimum_coverage,
388        is_continuous: fm.is_continuous.unwrap_or(false),
389        referenced_filenames: LoadedRule::normalize_optional_list(
390            fm.referenced_filenames.as_deref(),
391        ),
392        ignorable_urls: LoadedRule::normalize_optional_list(fm.ignorable_urls.as_deref()),
393        ignorable_emails: LoadedRule::normalize_optional_list(fm.ignorable_emails.as_deref()),
394        ignorable_copyrights: LoadedRule::normalize_optional_list(
395            fm.ignorable_copyrights.as_deref(),
396        ),
397        ignorable_holders: LoadedRule::normalize_optional_list(fm.ignorable_holders.as_deref()),
398        ignorable_authors: LoadedRule::normalize_optional_list(fm.ignorable_authors.as_deref()),
399        language: LoadedRule::normalize_optional_string(fm.language.as_deref()),
400        notes: LoadedRule::normalize_optional_string(fm.notes.as_deref()),
401        is_deprecated: fm.is_deprecated.unwrap_or(false),
402        replaced_by: fm.replaced_by.unwrap_or_default(),
403    })
404}
405
406/// Parse a .RULE file into a `LoadedRule` (loader-stage).
407///
408/// This function parses the file and returns a `LoadedRule` with normalized data.
409/// Deprecated entries are included - filtering is a build-stage concern.
410pub fn parse_rule_to_loaded(path: &Path) -> Result<LoadedRule> {
411    let content = fs::read_to_string(path)
412        .with_context(|| format!("Failed to read rule file: {}", path.display()))?;
413    parse_rule_source_to_loaded(
414        path.file_name()
415            .and_then(|s| s.to_str())
416            .unwrap_or("unknown.RULE"),
417        &content,
418        path,
419    )
420}
421
422/// Parse a rule from in-memory ScanCode-style `.RULE` content.
423pub fn parse_rule_str_to_loaded(identifier: &str, content: &str) -> Result<LoadedRule> {
424    let synthetic_path = Path::new(identifier);
425    parse_rule_source_to_loaded(identifier, content, synthetic_path)
426}
427
428fn parse_license_source_to_loaded(
429    filename: &str,
430    content: &str,
431    source_path: &Path,
432) -> Result<LoadedLicense> {
433    let key = LoadedLicense::derive_key(Path::new(filename))?;
434
435    let parsed = parse_license_file_content(content, source_path)?;
436
437    let fm: LicenseFrontmatter = yaml_serde::from_str(&parsed.yaml_content).map_err(|e| {
438        anyhow!(
439            "Failed to parse license frontmatter YAML in {}: {}\nContent was:\n{}",
440            source_path.display(),
441            e,
442            parsed.yaml_content
443        )
444    })?;
445
446    LoadedLicense::validate_key_match(&key, fm.key.as_deref()).map_err(|e| {
447        anyhow!(
448            "License file has key mismatch: {}: {}",
449            source_path.display(),
450            e
451        )
452    })?;
453
454    let is_deprecated = fm.is_deprecated.unwrap_or(false);
455    let is_unknown = fm.is_unknown.unwrap_or(false);
456    let is_generic = fm.is_generic.unwrap_or(false);
457
458    LoadedLicense::validate_text_content(
459        &parsed.text_content,
460        is_deprecated,
461        is_unknown,
462        is_generic,
463    )
464    .map_err(|e| {
465        anyhow!(
466            "License file has invalid content: {}: {}",
467            source_path.display(),
468            e
469        )
470    })?;
471
472    let name = LoadedLicense::derive_name(fm.name.as_deref(), fm.short_name.as_deref(), &key);
473
474    let reference_urls = LoadedLicense::merge_reference_urls(
475        fm.text_urls.as_deref(),
476        fm.other_urls.as_deref(),
477        fm.osi_url.as_deref(),
478        fm.faq_url.as_deref(),
479        fm.homepage_url.as_deref(),
480    );
481
482    let minimum_coverage = fm.minimum_coverage.and_then(|n| n.as_u8());
483
484    Ok(LoadedLicense {
485        key,
486        short_name: LoadedLicense::normalize_optional_string(fm.short_name.as_deref()),
487        name,
488        language: Some("en".to_string()),
489        spdx_license_key: LoadedLicense::normalize_optional_string(fm.spdx_license_key.as_deref()),
490        other_spdx_license_keys: fm.other_spdx_license_keys.unwrap_or_default(),
491        category: LoadedLicense::normalize_optional_string(fm.category.as_deref()),
492        owner: LoadedLicense::normalize_optional_string(fm.owner.as_deref()),
493        homepage_url: LoadedLicense::normalize_optional_string(fm.homepage_url.as_deref()),
494        text: parsed.text_content,
495        reference_urls,
496        osi_license_key: LoadedLicense::normalize_optional_string(fm.osi_license_key.as_deref()),
497        text_urls: LoadedLicense::normalize_optional_list(fm.text_urls.as_deref())
498            .unwrap_or_default(),
499        osi_url: LoadedLicense::normalize_optional_string(fm.osi_url.as_deref()),
500        faq_url: LoadedLicense::normalize_optional_string(fm.faq_url.as_deref()),
501        other_urls: LoadedLicense::normalize_optional_list(fm.other_urls.as_deref())
502            .unwrap_or_default(),
503        notes: LoadedLicense::normalize_optional_string(fm.notes.as_deref()),
504        is_deprecated,
505        is_exception: fm.is_exception.unwrap_or(false),
506        is_unknown,
507        is_generic,
508        replaced_by: fm.replaced_by.unwrap_or_default(),
509        minimum_coverage,
510        standard_notice: LoadedLicense::normalize_optional_string(fm.standard_notice.as_deref()),
511        ignorable_copyrights: LoadedLicense::normalize_optional_list(
512            fm.ignorable_copyrights.as_deref(),
513        ),
514        ignorable_holders: LoadedLicense::normalize_optional_list(fm.ignorable_holders.as_deref()),
515        ignorable_authors: LoadedLicense::normalize_optional_list(fm.ignorable_authors.as_deref()),
516        ignorable_urls: LoadedLicense::normalize_optional_list(fm.ignorable_urls.as_deref()),
517        ignorable_emails: LoadedLicense::normalize_optional_list(fm.ignorable_emails.as_deref()),
518    })
519}
520
521/// Parse a .LICENSE file into a `LoadedLicense` (loader-stage).
522///
523/// This function parses the file and returns a `LoadedLicense` with normalized data.
524/// Deprecated entries are included - filtering is a build-stage concern.
525pub fn parse_license_to_loaded(path: &Path) -> Result<LoadedLicense> {
526    let content = fs::read_to_string(path)
527        .with_context(|| format!("Failed to read license file: {}", path.display()))?;
528    parse_license_source_to_loaded(
529        path.file_name()
530            .and_then(|s| s.to_str())
531            .unwrap_or("unknown.LICENSE"),
532        &content,
533        path,
534    )
535}
536
537/// Parse a license from in-memory ScanCode-style `.LICENSE` content.
538pub fn parse_license_str_to_loaded(filename: &str, content: &str) -> Result<LoadedLicense> {
539    let synthetic_path = Path::new(filename);
540    parse_license_source_to_loaded(filename, content, synthetic_path)
541}
542
543/// Parse license file content into frontmatter and text sections.
544///
545/// The `path` parameter is used for error messages only.
546fn parse_license_file_content(content: &str, path: &Path) -> Result<ParsedLicenseFile> {
547    if content.len() < 6 {
548        return Err(anyhow!(
549            "License file content too short: {}",
550            path.display()
551        ));
552    }
553
554    let parts: Vec<&str> = FM_BOUNDARY.splitn(content, 3).collect();
555
556    if parts.len() < 3 {
557        let trimmed = content.trim();
558        if trimmed.is_empty() {
559            return Err(anyhow!(
560                "License file is empty or has no content: {}",
561                path.display()
562            ));
563        }
564        return Err(anyhow!(
565            "License file missing delimiter '---': {}",
566            path.display()
567        ));
568    }
569
570    let yaml_content = parts
571        .get(1)
572        .ok_or_else(|| anyhow!("Missing YAML frontmatter in {}", path.display()))?
573        .to_string();
574    let text_content = parts
575        .get(2)
576        .ok_or_else(|| {
577            anyhow!(
578                "Missing text content after frontmatter in {}",
579                path.display()
580            )
581        })?
582        .trim_start_matches('\n')
583        .trim()
584        .to_string();
585
586    Ok(ParsedLicenseFile {
587        yaml_content,
588        text_content,
589    })
590}
591
592/// Load all .RULE files from a directory into `LoadedRule` values (loader-stage).
593///
594/// This function loads ALL rules, including deprecated ones.
595/// Deprecated filtering is a build-stage concern.
596///
597/// # Arguments
598/// * `dir` - Directory containing .RULE files
599///
600/// # Returns
601/// * `Ok(Vec<LoadedRule>)` - All loaded rules (including deprecated)
602/// * `Err(...)` - Directory read error
603pub fn load_loaded_rules_from_directory(dir: &Path) -> Result<Vec<LoadedRule>> {
604    let mut rules = Vec::new();
605
606    let entries = fs::read_dir(dir)
607        .with_context(|| format!("Failed to read rules directory: {}", dir.display()))?;
608
609    for entry in entries {
610        let entry = entry
611            .with_context(|| format!("Failed to read directory entry in: {}", dir.display()))?;
612        let path = entry.path();
613
614        if path.is_file() && path.extension().and_then(|s| s.to_str()) == Some("RULE") {
615            match parse_rule_to_loaded(&path) {
616                Ok(rule) => rules.push(rule),
617                Err(e) => {
618                    warn!("Failed to parse rule file {}: {}", path.display(), e);
619                }
620            }
621        }
622    }
623
624    Ok(rules)
625}
626
627/// Load all .LICENSE files from a directory into `LoadedLicense` values (loader-stage).
628///
629/// This function loads ALL licenses, including deprecated ones.
630/// Deprecated filtering is a build-stage concern.
631///
632/// # Arguments
633/// * `dir` - Directory containing .LICENSE files
634///
635/// # Returns
636/// * `Ok(Vec<LoadedLicense>)` - All loaded licenses (including deprecated)
637/// * `Err(...)` - Directory read error
638pub fn load_loaded_licenses_from_directory(dir: &Path) -> Result<Vec<LoadedLicense>> {
639    let mut licenses = Vec::new();
640
641    let entries = fs::read_dir(dir)
642        .with_context(|| format!("Failed to read licenses directory: {}", dir.display()))?;
643
644    for entry in entries {
645        let entry = entry
646            .with_context(|| format!("Failed to read directory entry in: {}", dir.display()))?;
647        let path = entry.path();
648
649        if path.is_file() && path.extension().and_then(|s| s.to_str()) == Some("LICENSE") {
650            match parse_license_to_loaded(&path) {
651                Ok(license) => licenses.push(license),
652                Err(e) => {
653                    warn!("Failed to parse license file {}: {}", path.display(), e);
654                }
655            }
656        }
657    }
658
659    Ok(licenses)
660}
661
662/// Validate loaded rules for common issues.
663///
664/// Checks for:
665/// 1. Duplicate rule texts (warns if found)
666/// 2. Empty license expressions for non-false-positive rules (warns if found)
667///
668/// Corresponds to Python:
669/// - `models.py:validate()` for license expression validation
670/// - `index.py:_add_rules()` for duplicate detection via hash
671///
672/// Kept for backward compatibility with `load_rules_from_directory`.
673#[allow(dead_code)]
674fn validate_rules(rules: &[Rule]) {
675    let mut seen_texts: HashSet<&str> = HashSet::new();
676    let mut duplicate_count = 0;
677
678    for rule in rules {
679        if !seen_texts.insert(&rule.text) {
680            warn!(
681                "Duplicate rule text found for license_expression: {}",
682                rule.license_expression
683            );
684            duplicate_count += 1;
685        }
686
687        if !rule.is_false_positive && rule.license_expression.trim().is_empty() {
688            warn!("Rule has empty license_expression but is not marked as false_positive");
689        }
690    }
691
692    if duplicate_count > 0 {
693        warn!(
694            "Found {} duplicate rule text(s) during rule validation",
695            duplicate_count
696        );
697    }
698}
699
700/// Load all .RULE files from a directory into `Rule` values (backward-compatible).
701///
702/// This function loads rules and applies deprecated filtering during loading.
703/// For the two-stage pipeline, prefer `load_loaded_rules_from_directory` and
704/// `build_index_from_loaded`.
705///
706/// Kept for backward compatibility and testing despite not being used in production code.
707/// The new pipeline uses the two-stage loading process instead.
708#[allow(dead_code)]
709pub fn load_rules_from_directory(dir: &Path, with_deprecated: bool) -> Result<Vec<Rule>> {
710    let loaded = load_loaded_rules_from_directory(dir)?;
711    let rules: Vec<Rule> = loaded
712        .into_iter()
713        .filter(|r| with_deprecated || !r.is_deprecated)
714        .map(loaded_rule_to_rule)
715        .collect();
716    validate_rules(&rules);
717    Ok(rules)
718}
719
720/// Load all .LICENSE files from a directory into `License` values (backward-compatible).
721///
722/// This function loads licenses and applies deprecated filtering during loading.
723/// For the two-stage pipeline, prefer `load_loaded_licenses_from_directory` and
724/// `build_index_from_loaded`.
725///
726/// Kept for backward compatibility and testing despite not being used in production code.
727/// The new pipeline uses the two-stage loading process instead.
728#[allow(dead_code)]
729pub fn load_licenses_from_directory(dir: &Path, with_deprecated: bool) -> Result<Vec<License>> {
730    let loaded = load_loaded_licenses_from_directory(dir)?;
731    let licenses: Vec<License> = loaded
732        .into_iter()
733        .filter(|l| with_deprecated || !l.is_deprecated)
734        .map(loaded_license_to_license)
735        .collect();
736    Ok(licenses)
737}
738
739#[cfg(test)]
740mod tests {
741    use super::*;
742    use std::collections::HashMap;
743    use std::fs;
744    use tempfile::tempdir;
745
746    pub fn parse_rule_file(path: &Path) -> Result<Rule> {
747        let loaded = parse_rule_to_loaded(path)?;
748        Ok(loaded_rule_to_rule(loaded))
749    }
750
751    #[test]
752    fn test_parse_number_as_u8() {
753        let num_int: yaml_serde::Number = yaml_serde::from_str("100").unwrap();
754        assert_eq!(num_int.as_u8(), Some(100));
755
756        let num_out_of_range: yaml_serde::Number = yaml_serde::from_str("500").unwrap();
757        assert_eq!(num_out_of_range.as_u8(), None);
758
759        let num_float: yaml_serde::Number = yaml_serde::from_str("90.5").unwrap();
760        assert_eq!(num_float.as_u8(), Some(90));
761    }
762
763    #[test]
764    fn test_parse_simple_license_file() {
765        let dir = tempdir().unwrap();
766        let license_path = dir.path().join("mit.LICENSE");
767        fs::write(
768            &license_path,
769            r#"---
770key: mit
771short_name: MIT License
772name: MIT License
773category: Permissive
774spdx_license_key: MIT
775---
776MIT License text here"#,
777        )
778        .unwrap();
779
780        let license = parse_license_to_loaded(&license_path)
781            .map(loaded_license_to_license)
782            .unwrap();
783        assert_eq!(license.key, "mit");
784        assert_eq!(license.name, "MIT License");
785        assert!(license.text.contains("MIT License text"));
786    }
787
788    #[test]
789    fn test_parse_simple_rule_file() {
790        let dir = tempdir().unwrap();
791        let rule_path = dir.path().join("mit_1.RULE");
792        fs::write(
793            &rule_path,
794            r#"---
795license_expression: mit
796is_license_reference: yes
797relevance: 90
798referenced_filenames:
799    - MIT.txt
800---
801MIT.txt"#,
802        )
803        .unwrap();
804
805        let rule = parse_rule_file(&rule_path).unwrap();
806        assert_eq!(rule.license_expression, "mit");
807        assert_eq!(rule.text, "MIT.txt");
808        assert!(rule.is_license_reference());
809        assert_eq!(rule.relevance, 90);
810    }
811
812    #[test]
813    fn test_deserialize_yes_no_bool() {
814        let dir = tempdir().unwrap();
815        let rule_path = dir.path().join("test.RULE");
816
817        fs::write(
818            &rule_path,
819            r#"---
820license_expression: mit
821is_license_notice: yes
822is_license_tag: no
823---
824MIT License"#,
825        )
826        .unwrap();
827
828        let rule = parse_rule_file(&rule_path).unwrap();
829        assert!(rule.is_license_notice());
830        assert!(!rule.is_license_tag());
831    }
832
833    #[test]
834    fn test_load_licenses_from_directory() {
835        let dir = tempdir().unwrap();
836
837        fs::write(
838            dir.path().join("test.LICENSE"),
839            r#"---
840key: test
841name: Test License
842spdx_license_key: TEST
843category: Permissive
844---
845Test license text here"#,
846        )
847        .unwrap();
848
849        let licenses = load_licenses_from_directory(dir.path(), false).unwrap();
850        assert_eq!(licenses.len(), 1);
851
852        let license = &licenses[0];
853        assert_eq!(license.key, "test");
854        assert_eq!(license.name, "Test License");
855        assert_eq!(license.spdx_license_key, Some("TEST".to_string()));
856        assert!(!license.text.is_empty());
857    }
858
859    #[test]
860    fn test_load_rules_from_directory() {
861        let dir = tempdir().unwrap();
862
863        fs::write(
864            dir.path().join("test_1.RULE"),
865            r#"---
866license_expression: test
867is_license_reference: yes
868relevance: 85
869referenced_filenames:
870    - TEST.txt
871---
872TEST.txt"#,
873        )
874        .unwrap();
875
876        let rules = load_rules_from_directory(dir.path(), false).unwrap();
877        assert_eq!(rules.len(), 1);
878
879        let rule = &rules[0];
880        assert_eq!(rule.license_expression, "test");
881        assert!(rule.is_license_reference());
882        assert_eq!(rule.relevance, 85);
883    }
884
885    #[test]
886    fn test_validate_rules_detects_duplicates() {
887        let rules = vec![
888            Rule {
889                identifier: "mit.LICENSE".to_string(),
890                license_expression: "mit".to_string(),
891                text: "MIT License".to_string(),
892                tokens: vec![],
893                rule_kind: crate::license_detection::models::RuleKind::Text,
894                is_false_positive: false,
895                is_required_phrase: false,
896                is_from_license: false,
897                relevance: 100,
898                minimum_coverage: None,
899                has_stored_minimum_coverage: false,
900                is_continuous: false,
901                required_phrase_spans: vec![],
902                stopwords_by_pos: HashMap::new(),
903                referenced_filenames: None,
904                ignorable_urls: None,
905                ignorable_emails: None,
906                ignorable_copyrights: None,
907                ignorable_holders: None,
908                ignorable_authors: None,
909                language: None,
910                notes: None,
911                length_unique: 0,
912                high_length_unique: 0,
913                high_length: 0,
914                min_matched_length: 0,
915                min_high_matched_length: 0,
916                min_matched_length_unique: 0,
917                min_high_matched_length_unique: 0,
918                is_small: false,
919                is_tiny: false,
920                starts_with_license: false,
921                ends_with_license: false,
922                is_deprecated: false,
923                spdx_license_key: None,
924                other_spdx_license_keys: vec![],
925            },
926            Rule {
927                identifier: "apache-2.0.LICENSE".to_string(),
928                license_expression: "apache-2.0".to_string(),
929                text: "MIT License".to_string(),
930                tokens: vec![],
931                rule_kind: crate::license_detection::models::RuleKind::Text,
932                is_false_positive: false,
933                is_required_phrase: false,
934                is_from_license: false,
935                relevance: 100,
936                minimum_coverage: None,
937                has_stored_minimum_coverage: false,
938                is_continuous: false,
939                required_phrase_spans: vec![],
940                stopwords_by_pos: HashMap::new(),
941                referenced_filenames: None,
942                ignorable_urls: None,
943                ignorable_emails: None,
944                ignorable_copyrights: None,
945                ignorable_holders: None,
946                ignorable_authors: None,
947                language: None,
948                notes: None,
949                length_unique: 0,
950                high_length_unique: 0,
951                high_length: 0,
952                min_matched_length: 0,
953                min_high_matched_length: 0,
954                min_matched_length_unique: 0,
955                min_high_matched_length_unique: 0,
956                is_small: false,
957                is_tiny: false,
958                starts_with_license: false,
959                ends_with_license: false,
960                is_deprecated: false,
961                spdx_license_key: None,
962                other_spdx_license_keys: vec![],
963            },
964        ];
965
966        validate_rules(&rules);
967    }
968
969    #[test]
970    fn test_validate_rules_accepts_false_positive_without_expression() {
971        let rules = vec![Rule {
972            identifier: "fp.RULE".to_string(),
973            license_expression: "".to_string(),
974            text: "Some text".to_string(),
975            tokens: vec![],
976            rule_kind: crate::license_detection::models::RuleKind::None,
977            is_false_positive: true,
978            is_required_phrase: false,
979            is_from_license: false,
980            relevance: 100,
981            minimum_coverage: None,
982            has_stored_minimum_coverage: false,
983            is_continuous: false,
984            required_phrase_spans: vec![],
985            stopwords_by_pos: HashMap::new(),
986            referenced_filenames: None,
987            ignorable_urls: None,
988            ignorable_emails: None,
989            ignorable_copyrights: None,
990            ignorable_holders: None,
991            ignorable_authors: None,
992            language: None,
993            notes: Some("False positive for common pattern".to_string()),
994            length_unique: 0,
995            high_length_unique: 0,
996            high_length: 0,
997            min_matched_length: 0,
998            min_high_matched_length: 0,
999            min_matched_length_unique: 0,
1000            min_high_matched_length_unique: 0,
1001            is_small: false,
1002            is_tiny: false,
1003            starts_with_license: false,
1004            ends_with_license: false,
1005            is_deprecated: false,
1006            spdx_license_key: None,
1007            other_spdx_license_keys: vec![],
1008        }];
1009
1010        validate_rules(&rules);
1011    }
1012
1013    #[test]
1014    fn test_validate_rules_no_duplicates() {
1015        let rules = vec![
1016            Rule {
1017                identifier: "mit.LICENSE".to_string(),
1018                license_expression: "mit".to_string(),
1019                text: "MIT License".to_string(),
1020                tokens: vec![],
1021                rule_kind: crate::license_detection::models::RuleKind::Text,
1022                is_false_positive: false,
1023                is_required_phrase: false,
1024                is_from_license: false,
1025                relevance: 100,
1026                minimum_coverage: None,
1027                has_stored_minimum_coverage: false,
1028                is_continuous: false,
1029                required_phrase_spans: vec![],
1030                stopwords_by_pos: HashMap::new(),
1031                referenced_filenames: None,
1032                ignorable_urls: None,
1033                ignorable_emails: None,
1034                ignorable_copyrights: None,
1035                ignorable_holders: None,
1036                ignorable_authors: None,
1037                language: None,
1038                notes: None,
1039                length_unique: 0,
1040                high_length_unique: 0,
1041                high_length: 0,
1042                min_matched_length: 0,
1043                min_high_matched_length: 0,
1044                min_matched_length_unique: 0,
1045                min_high_matched_length_unique: 0,
1046                is_small: false,
1047                is_tiny: false,
1048                starts_with_license: false,
1049                ends_with_license: false,
1050                is_deprecated: false,
1051                spdx_license_key: None,
1052                other_spdx_license_keys: vec![],
1053            },
1054            Rule {
1055                identifier: "apache-2.0.LICENSE".to_string(),
1056                license_expression: "apache-2.0".to_string(),
1057                text: "Apache License".to_string(),
1058                tokens: vec![],
1059                rule_kind: crate::license_detection::models::RuleKind::Text,
1060                is_false_positive: false,
1061                is_required_phrase: false,
1062                is_from_license: false,
1063                relevance: 100,
1064                minimum_coverage: None,
1065                has_stored_minimum_coverage: false,
1066                is_continuous: false,
1067                required_phrase_spans: vec![],
1068                stopwords_by_pos: HashMap::new(),
1069                referenced_filenames: None,
1070                ignorable_urls: None,
1071                ignorable_emails: None,
1072                ignorable_copyrights: None,
1073                ignorable_holders: None,
1074                ignorable_authors: None,
1075                language: None,
1076                notes: None,
1077                length_unique: 0,
1078                high_length_unique: 0,
1079                high_length: 0,
1080                min_matched_length: 0,
1081                min_high_matched_length: 0,
1082                min_matched_length_unique: 0,
1083                min_high_matched_length_unique: 0,
1084                is_small: false,
1085                is_tiny: false,
1086                starts_with_license: false,
1087                ends_with_license: false,
1088                is_deprecated: false,
1089                spdx_license_key: None,
1090                other_spdx_license_keys: vec![],
1091            },
1092        ];
1093
1094        validate_rules(&rules);
1095    }
1096
1097    #[test]
1098    fn test_load_licenses_filters_deprecated_by_default() {
1099        let dir = tempdir().unwrap();
1100
1101        fs::write(
1102            dir.path().join("active.LICENSE"),
1103            r#"---
1104key: active
1105name: Active License
1106---
1107Active license text"#,
1108        )
1109        .unwrap();
1110
1111        fs::write(
1112            dir.path().join("deprecated.LICENSE"),
1113            r#"---
1114key: deprecated
1115name: Deprecated License
1116is_deprecated: yes
1117---
1118Deprecated license text"#,
1119        )
1120        .unwrap();
1121
1122        let licenses_without = load_licenses_from_directory(dir.path(), false).unwrap();
1123        assert_eq!(licenses_without.len(), 1);
1124        assert_eq!(licenses_without[0].key, "active");
1125
1126        let licenses_with = load_licenses_from_directory(dir.path(), true).unwrap();
1127        assert_eq!(licenses_with.len(), 2);
1128    }
1129
1130    #[test]
1131    fn test_load_rules_filters_deprecated_by_default() {
1132        let dir = tempdir().unwrap();
1133
1134        fs::write(
1135            dir.path().join("active.RULE"),
1136            r#"---
1137license_expression: active
1138is_license_notice: yes
1139---
1140Active rule text"#,
1141        )
1142        .unwrap();
1143
1144        fs::write(
1145            dir.path().join("deprecated.RULE"),
1146            r#"---
1147license_expression: deprecated
1148is_license_notice: yes
1149is_deprecated: yes
1150---
1151Deprecated rule text"#,
1152        )
1153        .unwrap();
1154
1155        let rules_without = load_rules_from_directory(dir.path(), false).unwrap();
1156        assert_eq!(rules_without.len(), 1);
1157        assert_eq!(rules_without[0].license_expression, "active");
1158
1159        let rules_with = load_rules_from_directory(dir.path(), true).unwrap();
1160        assert_eq!(rules_with.len(), 2);
1161    }
1162
1163    #[test]
1164    fn test_parse_rule_to_loaded() {
1165        let dir = tempdir().unwrap();
1166        let rule_path = dir.path().join("mit_1.RULE");
1167        fs::write(
1168            &rule_path,
1169            r#"---
1170license_expression: mit
1171is_license_reference: yes
1172relevance: 90
1173referenced_filenames:
1174    - MIT.txt
1175---
1176MIT.txt"#,
1177        )
1178        .unwrap();
1179
1180        let loaded = parse_rule_to_loaded(&rule_path).unwrap();
1181        assert_eq!(loaded.identifier, "mit_1.RULE");
1182        assert_eq!(loaded.license_expression, "mit");
1183        assert_eq!(loaded.text, "MIT.txt");
1184        assert_eq!(
1185            loaded.rule_kind,
1186            crate::license_detection::models::RuleKind::Reference
1187        );
1188        assert_eq!(loaded.relevance, Some(90));
1189        assert_eq!(
1190            loaded.referenced_filenames,
1191            Some(vec!["MIT.txt".to_string()])
1192        );
1193        assert!(!loaded.is_deprecated);
1194    }
1195
1196    #[test]
1197    fn test_parse_license_to_loaded() {
1198        let dir = tempdir().unwrap();
1199        let license_path = dir.path().join("mit.LICENSE");
1200        fs::write(
1201            &license_path,
1202            r#"---
1203key: mit
1204short_name: MIT License
1205name: MIT License
1206category: Permissive
1207spdx_license_key: MIT
1208---
1209MIT License text here"#,
1210        )
1211        .unwrap();
1212
1213        let loaded = parse_license_to_loaded(&license_path).unwrap();
1214        assert_eq!(loaded.key, "mit");
1215        assert_eq!(loaded.name, "MIT License");
1216        assert!(loaded.text.contains("MIT License text"));
1217        assert_eq!(loaded.spdx_license_key, Some("MIT".to_string()));
1218    }
1219
1220    #[test]
1221    fn test_load_loaded_rules_from_directory_includes_deprecated() {
1222        let dir = tempdir().unwrap();
1223
1224        fs::write(
1225            dir.path().join("active.RULE"),
1226            r#"---
1227license_expression: active
1228is_license_notice: yes
1229---
1230Active rule text"#,
1231        )
1232        .unwrap();
1233
1234        fs::write(
1235            dir.path().join("deprecated.RULE"),
1236            r#"---
1237license_expression: deprecated
1238is_license_notice: yes
1239is_deprecated: yes
1240---
1241Deprecated rule text"#,
1242        )
1243        .unwrap();
1244
1245        let loaded_rules = load_loaded_rules_from_directory(dir.path()).unwrap();
1246        assert_eq!(loaded_rules.len(), 2);
1247
1248        let active = loaded_rules
1249            .iter()
1250            .find(|r| r.license_expression == "active")
1251            .unwrap();
1252        assert!(!active.is_deprecated);
1253
1254        let deprecated = loaded_rules
1255            .iter()
1256            .find(|r| r.license_expression == "deprecated")
1257            .unwrap();
1258        assert!(deprecated.is_deprecated);
1259    }
1260
1261    #[test]
1262    fn test_load_loaded_licenses_from_directory_includes_deprecated() {
1263        let dir = tempdir().unwrap();
1264
1265        fs::write(
1266            dir.path().join("active.LICENSE"),
1267            r#"---
1268key: active
1269name: Active License
1270---
1271Active license text"#,
1272        )
1273        .unwrap();
1274
1275        fs::write(
1276            dir.path().join("deprecated.LICENSE"),
1277            r#"---
1278key: deprecated
1279name: Deprecated License
1280is_deprecated: yes
1281---
1282Deprecated license text"#,
1283        )
1284        .unwrap();
1285
1286        let loaded_licenses = load_loaded_licenses_from_directory(dir.path()).unwrap();
1287        assert_eq!(loaded_licenses.len(), 2);
1288
1289        let active = loaded_licenses.iter().find(|l| l.key == "active").unwrap();
1290        assert!(!active.is_deprecated);
1291
1292        let deprecated = loaded_licenses
1293            .iter()
1294            .find(|l| l.key == "deprecated")
1295            .unwrap();
1296        assert!(deprecated.is_deprecated);
1297    }
1298}