Skip to main content

provenant/license_detection/rules/
loader.rs

1//! Parse .LICENSE and .RULE files.
2//!
3//! This module provides two-stage loading:
4//! 1. Loader-stage: Parse files into `LoadedRule` and `LoadedLicense`
5//! 2. Build-stage: Convert to runtime `Rule` and `License` (deprecated filtering, etc.)
6//!
7//! The loader-stage functions (`parse_rule_to_loaded`, `parse_license_to_loaded`,
8//! `load_loaded_rules_from_directory`, `load_loaded_licenses_from_directory`) return
9//! all entries including deprecated ones. Deprecated filtering is a build-stage concern.
10
11use crate::license_detection::index::{loaded_license_to_license, loaded_rule_to_rule};
12use crate::license_detection::models::{License, LoadedLicense, LoadedRule, Rule};
13use anyhow::{Context, Result, anyhow};
14use log::warn;
15use regex::Regex;
16use serde::{Deserialize, Deserializer, Serialize};
17use std::collections::HashSet;
18use std::fs;
19use std::path::Path;
20use std::sync::LazyLock;
21
/// Frontmatter boundary pattern, compiled once on first use.
///
/// In multi-line mode it matches a line that starts with three or more `-`
/// characters followed only by optional whitespace. Panics on first access if
/// the pattern fails to compile.
static FM_BOUNDARY: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"(?m)^-{3,}\s*$").expect("Invalid frontmatter regex"));
24
25fn deserialize_yes_no_bool<'de, D>(deserializer: D) -> Result<Option<bool>, D::Error>
26where
27    D: Deserializer<'de>,
28{
29    #[derive(Deserialize, Serialize)]
30    #[serde(untagged)]
31    enum YesNoOrBool {
32        String(String),
33        Bool(bool),
34    }
35
36    match YesNoOrBool::deserialize(deserializer)? {
37        YesNoOrBool::Bool(b) => Ok(Some(b)),
38        YesNoOrBool::String(s) => {
39            let lower = s.to_lowercase();
40            if lower == "yes" || lower == "true" || lower == "1" {
41                Ok(Some(true))
42            } else if lower == "no" || lower == "false" || lower == "0" {
43                Ok(Some(false))
44            } else {
45                Ok(None)
46            }
47        }
48    }
49}
50
/// Conversion of a parsed numeric value into a `u8`.
trait ParseNumber {
    /// Returns the value as a `u8`, or `None` when the implementation cannot
    /// represent it in that type.
    fn as_u8(&self) -> Option<u8>;
}
54
55impl ParseNumber for yaml_serde::Number {
56    fn as_u8(&self) -> Option<u8> {
57        self.as_i64()
58            .and_then(|n| u8::try_from(n).ok())
59            .or_else(|| {
60                self.as_f64().and_then(|f| {
61                    if f >= 0.0 && f <= f64::from(u8::MAX) {
62                        // truncation toward zero is intentional (e.g. 90.5 → 90)
63                        #[allow(clippy::cast_sign_loss)]
64                        Some(f as u8)
65                    } else {
66                        None
67                    }
68                })
69            })
70    }
71}
72
/// Raw YAML frontmatter of a `.LICENSE` file, as deserialized by serde.
///
/// Every field is optional and falls back to `None` when the key is absent
/// (`#[serde(default)]`). Boolean flags go through `deserialize_yes_no_bool`,
/// so they may be written as booleans or as yes/no style text.
#[derive(Debug, Deserialize)]
// NOTE(review): every field appears to be read by `parse_license_source_to_loaded`;
// this allow may no longer be needed — confirm before removing.
#[allow(dead_code)]
struct LicenseFrontmatter {
    /// Key declared in the file; compared against the key derived from the file name.
    #[serde(default)]
    key: Option<String>,

    #[serde(default)]
    short_name: Option<String>,

    #[serde(default)]
    name: Option<String>,

    #[serde(default)]
    category: Option<String>,

    #[serde(default)]
    owner: Option<String>,

    #[serde(default)]
    homepage_url: Option<String>,

    #[serde(default)]
    notes: Option<String>,

    #[serde(default)]
    spdx_license_key: Option<String>,

    #[serde(default)]
    other_spdx_license_keys: Option<Vec<String>>,

    #[serde(default)]
    osi_license_key: Option<String>,

    #[serde(default)]
    text_urls: Option<Vec<String>>,

    #[serde(default)]
    osi_url: Option<String>,

    #[serde(default)]
    faq_url: Option<String>,

    #[serde(default)]
    other_urls: Option<Vec<String>>,

    #[serde(default, deserialize_with = "deserialize_yes_no_bool")]
    is_deprecated: Option<bool>,

    #[serde(default)]
    replaced_by: Option<Vec<String>>,

    #[serde(default, deserialize_with = "deserialize_yes_no_bool")]
    is_exception: Option<bool>,

    #[serde(default, deserialize_with = "deserialize_yes_no_bool")]
    is_unknown: Option<bool>,

    #[serde(default, deserialize_with = "deserialize_yes_no_bool")]
    is_generic: Option<bool>,

    /// Kept as a raw YAML number; narrowed to `u8` with `ParseNumber::as_u8` by the loader.
    #[serde(default)]
    minimum_coverage: Option<yaml_serde::Number>,

    #[serde(default)]
    standard_notice: Option<String>,

    #[serde(default)]
    ignorable_copyrights: Option<Vec<String>>,

    #[serde(default)]
    ignorable_holders: Option<Vec<String>>,

    #[serde(default)]
    ignorable_authors: Option<Vec<String>>,

    #[serde(default)]
    ignorable_urls: Option<Vec<String>>,

    #[serde(default)]
    ignorable_emails: Option<Vec<String>>,
}
154
/// Parsed rule file content, split into frontmatter and text.
struct ParsedRuleFile {
    /// Raw text found between the first and second frontmatter boundary lines.
    yaml_content: String,
    /// Everything after the second boundary line, with surrounding whitespace trimmed.
    text_content: String,
    /// `true` when the frontmatter parses to a YAML mapping that contains a
    /// `minimum_coverage` key.
    has_stored_minimum_coverage: bool,
}
161
/// Parsed license file content, split into frontmatter and text.
struct ParsedLicenseFile {
    /// Raw text found between the first and second frontmatter boundary lines.
    yaml_content: String,
    /// Everything after the second boundary line, with surrounding whitespace trimmed.
    text_content: String,
}
167
168/// Parse file content into frontmatter and text sections.
169///
170/// Returns `ParsedRuleFile` with yaml_content, text_content, and metadata.
171/// The `path` parameter is used for error messages only.
172fn parse_file_content(content: &str, path: &Path) -> Result<ParsedRuleFile> {
173    if content.len() < 6 {
174        return Err(anyhow!("File content too short: {}", path.display()));
175    }
176
177    let parts: Vec<&str> = FM_BOUNDARY.splitn(content, 3).collect();
178
179    if parts.len() < 3 {
180        let trimmed = content.trim();
181        if trimmed.is_empty() {
182            return Err(anyhow!(
183                "File is empty or has no content: {}",
184                path.display()
185            ));
186        }
187        return Err(anyhow!("File missing delimiter '---': {}", path.display()));
188    }
189
190    let yaml_content = parts
191        .get(1)
192        .ok_or_else(|| anyhow!("Missing YAML frontmatter in {}", path.display()))?
193        .to_string();
194    let text_content = parts
195        .get(2)
196        .ok_or_else(|| {
197            anyhow!(
198                "Missing text content after frontmatter in {}",
199                path.display()
200            )
201        })?
202        .trim_start_matches('\n')
203        .trim()
204        .to_string();
205
206    let frontmatter_value: yaml_serde::Value =
207        yaml_serde::from_str(&yaml_content).map_err(|e| {
208            anyhow!(
209                "Failed to parse frontmatter YAML in {}: {}\nContent was:\n{}",
210                path.display(),
211                e,
212                yaml_content
213            )
214        })?;
215
216    let has_stored_minimum_coverage = frontmatter_value.as_mapping().is_some_and(|mapping| {
217        mapping.contains_key(yaml_serde::Value::String("minimum_coverage".to_string()))
218    });
219
220    Ok(ParsedRuleFile {
221        yaml_content,
222        text_content,
223        has_stored_minimum_coverage,
224    })
225}
226
/// Raw YAML frontmatter of a `.RULE` file, as deserialized by serde.
///
/// Every field is optional and falls back to `None` when the key is absent
/// (`#[serde(default)]`). Boolean flags go through `deserialize_yes_no_bool`,
/// so they may be written as booleans or as yes/no style text.
#[derive(Debug, Deserialize)]
// NOTE(review): every field appears to be read by `parse_rule_source_to_loaded`;
// this allow may no longer be needed — confirm before removing.
#[allow(dead_code)]
struct RuleFrontmatter {
    #[serde(default)]
    license_expression: Option<String>,

    // The six `is_license_*` flags below are passed together to
    // `LoadedRule::derive_rule_kind`.
    #[serde(default, deserialize_with = "deserialize_yes_no_bool")]
    is_license_text: Option<bool>,

    #[serde(default, deserialize_with = "deserialize_yes_no_bool")]
    is_license_notice: Option<bool>,

    #[serde(default, deserialize_with = "deserialize_yes_no_bool")]
    is_license_reference: Option<bool>,

    #[serde(default, deserialize_with = "deserialize_yes_no_bool")]
    is_license_tag: Option<bool>,

    #[serde(default, deserialize_with = "deserialize_yes_no_bool")]
    is_license_intro: Option<bool>,

    #[serde(default, deserialize_with = "deserialize_yes_no_bool")]
    is_license_clue: Option<bool>,

    #[serde(default, deserialize_with = "deserialize_yes_no_bool")]
    is_false_positive: Option<bool>,

    #[serde(default, deserialize_with = "deserialize_yes_no_bool")]
    is_required_phrase: Option<bool>,

    #[serde(default, deserialize_with = "deserialize_yes_no_bool")]
    skip_for_required_phrase_generation: Option<bool>,

    /// Kept as a raw YAML number; narrowed to `u8` with `ParseNumber::as_u8` by the loader.
    #[serde(default)]
    relevance: Option<yaml_serde::Number>,

    /// Kept as a raw YAML number; narrowed to `u8` with `ParseNumber::as_u8` by the loader.
    #[serde(default)]
    minimum_coverage: Option<yaml_serde::Number>,

    #[serde(default, deserialize_with = "deserialize_yes_no_bool")]
    is_continuous: Option<bool>,

    #[serde(default, deserialize_with = "deserialize_yes_no_bool")]
    is_deprecated: Option<bool>,

    #[serde(default)]
    referenced_filenames: Option<Vec<String>>,

    #[serde(default)]
    replaced_by: Option<Vec<String>>,

    #[serde(default)]
    ignorable_urls: Option<Vec<String>>,

    #[serde(default)]
    ignorable_emails: Option<Vec<String>>,

    #[serde(default)]
    notes: Option<String>,

    #[serde(default)]
    ignorable_copyrights: Option<Vec<String>>,

    #[serde(default)]
    ignorable_holders: Option<Vec<String>>,

    #[serde(default)]
    ignorable_authors: Option<Vec<String>>,

    #[serde(default)]
    language: Option<String>,
}
299
300fn parse_rule_source_to_loaded(
301    identifier: &str,
302    content: &str,
303    source_path: &Path,
304) -> Result<LoadedRule> {
305    let identifier = LoadedRule::derive_identifier(
306        source_path
307            .file_name()
308            .and_then(|s| s.to_str())
309            .unwrap_or(identifier),
310    );
311
312    let parsed = parse_file_content(content, source_path)?;
313
314    if parsed.text_content.is_empty() {
315        return Err(anyhow!(
316            "Rule file has empty text content: {}",
317            source_path.display()
318        ));
319    }
320
321    let fm: RuleFrontmatter = yaml_serde::from_str(&parsed.yaml_content).map_err(|e| {
322        anyhow!(
323            "Failed to parse rule frontmatter YAML in {}: {}\nContent was:\n{}",
324            source_path.display(),
325            e,
326            parsed.yaml_content
327        )
328    })?;
329
330    let is_false_positive = fm.is_false_positive.unwrap_or(false);
331
332    let rule_kind = LoadedRule::derive_rule_kind(
333        fm.is_license_text.unwrap_or(false),
334        fm.is_license_notice.unwrap_or(false),
335        fm.is_license_reference.unwrap_or(false),
336        fm.is_license_tag.unwrap_or(false),
337        fm.is_license_intro.unwrap_or(false),
338        fm.is_license_clue.unwrap_or(false),
339    )
340    .map_err(|e| {
341        anyhow!(
342            "Rule file has invalid rule-kind flags: {}: {}",
343            source_path.display(),
344            e
345        )
346    })?;
347
348    LoadedRule::validate_rule_kind_flags(rule_kind, is_false_positive).map_err(|e| {
349        anyhow!(
350            "Rule file has invalid flags: {}: {}",
351            source_path.display(),
352            e
353        )
354    })?;
355
356    let license_expression = LoadedRule::normalize_license_expression(
357        fm.license_expression.as_deref(),
358        is_false_positive,
359    )
360    .map_err(|e| {
361        anyhow!(
362            "Rule file has invalid license_expression: {}: {}",
363            source_path.display(),
364            e
365        )
366    })?;
367
368    let relevance = fm.relevance.and_then(|n| n.as_u8());
369
370    let minimum_coverage = fm.minimum_coverage.and_then(|n| n.as_u8());
371
372    Ok(LoadedRule {
373        identifier,
374        license_expression,
375        text: parsed.text_content,
376        rule_kind,
377        is_false_positive,
378        is_required_phrase: fm.is_required_phrase.unwrap_or(false),
379        skip_for_required_phrase_generation: fm
380            .skip_for_required_phrase_generation
381            .unwrap_or(false),
382        relevance,
383        minimum_coverage,
384        has_stored_minimum_coverage: parsed.has_stored_minimum_coverage,
385        is_continuous: fm.is_continuous.unwrap_or(false),
386        referenced_filenames: LoadedRule::normalize_optional_list(
387            fm.referenced_filenames.as_deref(),
388        ),
389        ignorable_urls: LoadedRule::normalize_optional_list(fm.ignorable_urls.as_deref()),
390        ignorable_emails: LoadedRule::normalize_optional_list(fm.ignorable_emails.as_deref()),
391        ignorable_copyrights: LoadedRule::normalize_optional_list(
392            fm.ignorable_copyrights.as_deref(),
393        ),
394        ignorable_holders: LoadedRule::normalize_optional_list(fm.ignorable_holders.as_deref()),
395        ignorable_authors: LoadedRule::normalize_optional_list(fm.ignorable_authors.as_deref()),
396        language: LoadedRule::normalize_optional_string(fm.language.as_deref()),
397        notes: LoadedRule::normalize_optional_string(fm.notes.as_deref()),
398        is_deprecated: fm.is_deprecated.unwrap_or(false),
399        replaced_by: fm.replaced_by.unwrap_or_default(),
400    })
401}
402
403/// Parse a .RULE file into a `LoadedRule` (loader-stage).
404///
405/// This function parses the file and returns a `LoadedRule` with normalized data.
406/// Deprecated entries are included - filtering is a build-stage concern.
407pub fn parse_rule_to_loaded(path: &Path) -> Result<LoadedRule> {
408    let content = fs::read_to_string(path)
409        .with_context(|| format!("Failed to read rule file: {}", path.display()))?;
410    parse_rule_source_to_loaded(
411        path.file_name()
412            .and_then(|s| s.to_str())
413            .unwrap_or("unknown.RULE"),
414        &content,
415        path,
416    )
417}
418
419/// Parse a rule from in-memory ScanCode-style `.RULE` content.
420pub fn parse_rule_str_to_loaded(identifier: &str, content: &str) -> Result<LoadedRule> {
421    let synthetic_path = Path::new(identifier);
422    parse_rule_source_to_loaded(identifier, content, synthetic_path)
423}
424
425fn parse_license_source_to_loaded(
426    filename: &str,
427    content: &str,
428    source_path: &Path,
429) -> Result<LoadedLicense> {
430    let key = LoadedLicense::derive_key(Path::new(filename))?;
431
432    let parsed = parse_license_file_content(content, source_path)?;
433
434    let fm: LicenseFrontmatter = yaml_serde::from_str(&parsed.yaml_content).map_err(|e| {
435        anyhow!(
436            "Failed to parse license frontmatter YAML in {}: {}\nContent was:\n{}",
437            source_path.display(),
438            e,
439            parsed.yaml_content
440        )
441    })?;
442
443    LoadedLicense::validate_key_match(&key, fm.key.as_deref()).map_err(|e| {
444        anyhow!(
445            "License file has key mismatch: {}: {}",
446            source_path.display(),
447            e
448        )
449    })?;
450
451    let is_deprecated = fm.is_deprecated.unwrap_or(false);
452    let is_unknown = fm.is_unknown.unwrap_or(false);
453    let is_generic = fm.is_generic.unwrap_or(false);
454
455    LoadedLicense::validate_text_content(
456        &parsed.text_content,
457        is_deprecated,
458        is_unknown,
459        is_generic,
460    )
461    .map_err(|e| {
462        anyhow!(
463            "License file has invalid content: {}: {}",
464            source_path.display(),
465            e
466        )
467    })?;
468
469    let name = LoadedLicense::derive_name(fm.name.as_deref(), fm.short_name.as_deref(), &key);
470
471    let reference_urls = LoadedLicense::merge_reference_urls(
472        fm.text_urls.as_deref(),
473        fm.other_urls.as_deref(),
474        fm.osi_url.as_deref(),
475        fm.faq_url.as_deref(),
476        fm.homepage_url.as_deref(),
477    );
478
479    let minimum_coverage = fm.minimum_coverage.and_then(|n| n.as_u8());
480
481    Ok(LoadedLicense {
482        key,
483        short_name: LoadedLicense::normalize_optional_string(fm.short_name.as_deref()),
484        name,
485        language: Some("en".to_string()),
486        spdx_license_key: LoadedLicense::normalize_optional_string(fm.spdx_license_key.as_deref()),
487        other_spdx_license_keys: fm.other_spdx_license_keys.unwrap_or_default(),
488        category: LoadedLicense::normalize_optional_string(fm.category.as_deref()),
489        owner: LoadedLicense::normalize_optional_string(fm.owner.as_deref()),
490        homepage_url: LoadedLicense::normalize_optional_string(fm.homepage_url.as_deref()),
491        text: parsed.text_content,
492        reference_urls,
493        osi_license_key: LoadedLicense::normalize_optional_string(fm.osi_license_key.as_deref()),
494        text_urls: LoadedLicense::normalize_optional_list(fm.text_urls.as_deref())
495            .unwrap_or_default(),
496        osi_url: LoadedLicense::normalize_optional_string(fm.osi_url.as_deref()),
497        faq_url: LoadedLicense::normalize_optional_string(fm.faq_url.as_deref()),
498        other_urls: LoadedLicense::normalize_optional_list(fm.other_urls.as_deref())
499            .unwrap_or_default(),
500        notes: LoadedLicense::normalize_optional_string(fm.notes.as_deref()),
501        is_deprecated,
502        is_exception: fm.is_exception.unwrap_or(false),
503        is_unknown,
504        is_generic,
505        replaced_by: fm.replaced_by.unwrap_or_default(),
506        minimum_coverage,
507        standard_notice: LoadedLicense::normalize_optional_string(fm.standard_notice.as_deref()),
508        ignorable_copyrights: LoadedLicense::normalize_optional_list(
509            fm.ignorable_copyrights.as_deref(),
510        ),
511        ignorable_holders: LoadedLicense::normalize_optional_list(fm.ignorable_holders.as_deref()),
512        ignorable_authors: LoadedLicense::normalize_optional_list(fm.ignorable_authors.as_deref()),
513        ignorable_urls: LoadedLicense::normalize_optional_list(fm.ignorable_urls.as_deref()),
514        ignorable_emails: LoadedLicense::normalize_optional_list(fm.ignorable_emails.as_deref()),
515    })
516}
517
518/// Parse a .LICENSE file into a `LoadedLicense` (loader-stage).
519///
520/// This function parses the file and returns a `LoadedLicense` with normalized data.
521/// Deprecated entries are included - filtering is a build-stage concern.
522pub fn parse_license_to_loaded(path: &Path) -> Result<LoadedLicense> {
523    let content = fs::read_to_string(path)
524        .with_context(|| format!("Failed to read license file: {}", path.display()))?;
525    parse_license_source_to_loaded(
526        path.file_name()
527            .and_then(|s| s.to_str())
528            .unwrap_or("unknown.LICENSE"),
529        &content,
530        path,
531    )
532}
533
534/// Parse a license from in-memory ScanCode-style `.LICENSE` content.
535pub fn parse_license_str_to_loaded(filename: &str, content: &str) -> Result<LoadedLicense> {
536    let synthetic_path = Path::new(filename);
537    parse_license_source_to_loaded(filename, content, synthetic_path)
538}
539
540/// Parse license file content into frontmatter and text sections.
541///
542/// The `path` parameter is used for error messages only.
543fn parse_license_file_content(content: &str, path: &Path) -> Result<ParsedLicenseFile> {
544    if content.len() < 6 {
545        return Err(anyhow!(
546            "License file content too short: {}",
547            path.display()
548        ));
549    }
550
551    let parts: Vec<&str> = FM_BOUNDARY.splitn(content, 3).collect();
552
553    if parts.len() < 3 {
554        let trimmed = content.trim();
555        if trimmed.is_empty() {
556            return Err(anyhow!(
557                "License file is empty or has no content: {}",
558                path.display()
559            ));
560        }
561        return Err(anyhow!(
562            "License file missing delimiter '---': {}",
563            path.display()
564        ));
565    }
566
567    let yaml_content = parts
568        .get(1)
569        .ok_or_else(|| anyhow!("Missing YAML frontmatter in {}", path.display()))?
570        .to_string();
571    let text_content = parts
572        .get(2)
573        .ok_or_else(|| {
574            anyhow!(
575                "Missing text content after frontmatter in {}",
576                path.display()
577            )
578        })?
579        .trim_start_matches('\n')
580        .trim()
581        .to_string();
582
583    Ok(ParsedLicenseFile {
584        yaml_content,
585        text_content,
586    })
587}
588
589/// Load all .RULE files from a directory into `LoadedRule` values (loader-stage).
590///
591/// This function loads ALL rules, including deprecated ones.
592/// Deprecated filtering is a build-stage concern.
593///
594/// # Arguments
595/// * `dir` - Directory containing .RULE files
596///
597/// # Returns
598/// * `Ok(Vec<LoadedRule>)` - All loaded rules (including deprecated)
599/// * `Err(...)` - Directory read error
600pub fn load_loaded_rules_from_directory(dir: &Path) -> Result<Vec<LoadedRule>> {
601    let mut rules = Vec::new();
602
603    let entries = fs::read_dir(dir)
604        .with_context(|| format!("Failed to read rules directory: {}", dir.display()))?;
605
606    for entry in entries {
607        let entry = entry
608            .with_context(|| format!("Failed to read directory entry in: {}", dir.display()))?;
609        let path = entry.path();
610
611        if path.is_file() && path.extension().and_then(|s| s.to_str()) == Some("RULE") {
612            match parse_rule_to_loaded(&path) {
613                Ok(rule) => rules.push(rule),
614                Err(e) => {
615                    warn!("Failed to parse rule file {}: {}", path.display(), e);
616                }
617            }
618        }
619    }
620
621    Ok(rules)
622}
623
624/// Load all .LICENSE files from a directory into `LoadedLicense` values (loader-stage).
625///
626/// This function loads ALL licenses, including deprecated ones.
627/// Deprecated filtering is a build-stage concern.
628///
629/// # Arguments
630/// * `dir` - Directory containing .LICENSE files
631///
632/// # Returns
633/// * `Ok(Vec<LoadedLicense>)` - All loaded licenses (including deprecated)
634/// * `Err(...)` - Directory read error
635pub fn load_loaded_licenses_from_directory(dir: &Path) -> Result<Vec<LoadedLicense>> {
636    let mut licenses = Vec::new();
637
638    let entries = fs::read_dir(dir)
639        .with_context(|| format!("Failed to read licenses directory: {}", dir.display()))?;
640
641    for entry in entries {
642        let entry = entry
643            .with_context(|| format!("Failed to read directory entry in: {}", dir.display()))?;
644        let path = entry.path();
645
646        if path.is_file() && path.extension().and_then(|s| s.to_str()) == Some("LICENSE") {
647            match parse_license_to_loaded(&path) {
648                Ok(license) => licenses.push(license),
649                Err(e) => {
650                    warn!("Failed to parse license file {}: {}", path.display(), e);
651                }
652            }
653        }
654    }
655
656    Ok(licenses)
657}
658
659/// Validate loaded rules for common issues.
660///
661/// Checks for:
662/// 1. Duplicate rule texts (warns if found)
663/// 2. Empty license expressions for non-false-positive rules (warns if found)
664///
665/// Corresponds to Python:
666/// - `models.py:validate()` for license expression validation
667/// - `index.py:_add_rules()` for duplicate detection via hash
668///
669/// Kept for backward compatibility with `load_rules_from_directory`.
670#[allow(dead_code)]
671fn validate_rules(rules: &[Rule]) {
672    let mut seen_texts: HashSet<&str> = HashSet::new();
673    let mut duplicate_count = 0;
674
675    for rule in rules {
676        if !seen_texts.insert(&rule.text) {
677            warn!(
678                "Duplicate rule text found for license_expression: {}",
679                rule.license_expression
680            );
681            duplicate_count += 1;
682        }
683
684        if !rule.is_false_positive && rule.license_expression.trim().is_empty() {
685            warn!("Rule has empty license_expression but is not marked as false_positive");
686        }
687    }
688
689    if duplicate_count > 0 {
690        warn!(
691            "Found {} duplicate rule text(s) during rule validation",
692            duplicate_count
693        );
694    }
695}
696
697/// Load all .RULE files from a directory into `Rule` values (backward-compatible).
698///
699/// This function loads rules and applies deprecated filtering during loading.
700/// For the two-stage pipeline, prefer `load_loaded_rules_from_directory` and
701/// `build_index_from_loaded`.
702///
703/// Kept for backward compatibility and testing despite not being used in production code.
704/// The new pipeline uses the two-stage loading process instead.
705#[allow(dead_code)]
706pub fn load_rules_from_directory(dir: &Path, with_deprecated: bool) -> Result<Vec<Rule>> {
707    let loaded = load_loaded_rules_from_directory(dir)?;
708    let rules: Vec<Rule> = loaded
709        .into_iter()
710        .filter(|r| with_deprecated || !r.is_deprecated)
711        .map(loaded_rule_to_rule)
712        .collect();
713    validate_rules(&rules);
714    Ok(rules)
715}
716
717/// Load all .LICENSE files from a directory into `License` values (backward-compatible).
718///
719/// This function loads licenses and applies deprecated filtering during loading.
720/// For the two-stage pipeline, prefer `load_loaded_licenses_from_directory` and
721/// `build_index_from_loaded`.
722///
723/// Kept for backward compatibility and testing despite not being used in production code.
724/// The new pipeline uses the two-stage loading process instead.
725#[allow(dead_code)]
726pub fn load_licenses_from_directory(dir: &Path, with_deprecated: bool) -> Result<Vec<License>> {
727    let loaded = load_loaded_licenses_from_directory(dir)?;
728    let licenses: Vec<License> = loaded
729        .into_iter()
730        .filter(|l| with_deprecated || !l.is_deprecated)
731        .map(loaded_license_to_license)
732        .collect();
733    Ok(licenses)
734}
735
736#[cfg(test)]
737mod tests {
738    use super::*;
739    use std::collections::HashMap;
740    use std::fs;
741    use tempfile::tempdir;
742
743    pub fn parse_rule_file(path: &Path) -> Result<Rule> {
744        let loaded = parse_rule_to_loaded(path)?;
745        Ok(loaded_rule_to_rule(loaded))
746    }
747
748    #[test]
749    fn test_parse_number_as_u8() {
750        let num_int: yaml_serde::Number = yaml_serde::from_str("100").unwrap();
751        assert_eq!(num_int.as_u8(), Some(100));
752
753        let num_out_of_range: yaml_serde::Number = yaml_serde::from_str("500").unwrap();
754        assert_eq!(num_out_of_range.as_u8(), None);
755
756        let num_float: yaml_serde::Number = yaml_serde::from_str("90.5").unwrap();
757        assert_eq!(num_float.as_u8(), Some(90));
758    }
759
760    #[test]
761    fn test_parse_simple_license_file() {
762        let dir = tempdir().unwrap();
763        let license_path = dir.path().join("mit.LICENSE");
764        fs::write(
765            &license_path,
766            r#"---
767key: mit
768short_name: MIT License
769name: MIT License
770category: Permissive
771spdx_license_key: MIT
772---
773MIT License text here"#,
774        )
775        .unwrap();
776
777        let license = parse_license_to_loaded(&license_path)
778            .map(loaded_license_to_license)
779            .unwrap();
780        assert_eq!(license.key, "mit");
781        assert_eq!(license.name, "MIT License");
782        assert!(license.text.contains("MIT License text"));
783    }
784
785    #[test]
786    fn test_parse_simple_rule_file() {
787        let dir = tempdir().unwrap();
788        let rule_path = dir.path().join("mit_1.RULE");
789        fs::write(
790            &rule_path,
791            r#"---
792license_expression: mit
793is_license_reference: yes
794relevance: 90
795referenced_filenames:
796    - MIT.txt
797---
798MIT.txt"#,
799        )
800        .unwrap();
801
802        let rule = parse_rule_file(&rule_path).unwrap();
803        assert_eq!(rule.license_expression, "mit");
804        assert_eq!(rule.text, "MIT.txt");
805        assert!(rule.is_license_reference());
806        assert_eq!(rule.relevance, 90);
807    }
808
809    #[test]
810    fn test_deserialize_yes_no_bool() {
811        let dir = tempdir().unwrap();
812        let rule_path = dir.path().join("test.RULE");
813
814        fs::write(
815            &rule_path,
816            r#"---
817license_expression: mit
818is_license_notice: yes
819is_license_tag: no
820---
821MIT License"#,
822        )
823        .unwrap();
824
825        let rule = parse_rule_file(&rule_path).unwrap();
826        assert!(rule.is_license_notice());
827        assert!(!rule.is_license_tag());
828    }
829
830    #[test]
831    fn test_load_licenses_from_directory() {
832        let dir = tempdir().unwrap();
833
834        fs::write(
835            dir.path().join("test.LICENSE"),
836            r#"---
837key: test
838name: Test License
839spdx_license_key: TEST
840category: Permissive
841---
842Test license text here"#,
843        )
844        .unwrap();
845
846        let licenses = load_licenses_from_directory(dir.path(), false).unwrap();
847        assert_eq!(licenses.len(), 1);
848
849        let license = &licenses[0];
850        assert_eq!(license.key, "test");
851        assert_eq!(license.name, "Test License");
852        assert_eq!(license.spdx_license_key, Some("TEST".to_string()));
853        assert!(!license.text.is_empty());
854    }
855
856    #[test]
857    fn test_load_rules_from_directory() {
858        let dir = tempdir().unwrap();
859
860        fs::write(
861            dir.path().join("test_1.RULE"),
862            r#"---
863license_expression: test
864is_license_reference: yes
865relevance: 85
866referenced_filenames:
867    - TEST.txt
868---
869TEST.txt"#,
870        )
871        .unwrap();
872
873        let rules = load_rules_from_directory(dir.path(), false).unwrap();
874        assert_eq!(rules.len(), 1);
875
876        let rule = &rules[0];
877        assert_eq!(rule.license_expression, "test");
878        assert!(rule.is_license_reference());
879        assert_eq!(rule.relevance, 85);
880    }
881
882    #[test]
883    fn test_validate_rules_detects_duplicates() {
884        let rules = vec![
885            Rule {
886                identifier: "mit.LICENSE".to_string(),
887                license_expression: "mit".to_string(),
888                text: "MIT License".to_string(),
889                tokens: vec![],
890                rule_kind: crate::license_detection::models::RuleKind::Text,
891                is_false_positive: false,
892                is_required_phrase: false,
893                is_from_license: false,
894                relevance: 100,
895                minimum_coverage: None,
896                has_stored_minimum_coverage: false,
897                is_continuous: false,
898                required_phrase_spans: vec![],
899                stopwords_by_pos: HashMap::new(),
900                referenced_filenames: None,
901                ignorable_urls: None,
902                ignorable_emails: None,
903                ignorable_copyrights: None,
904                ignorable_holders: None,
905                ignorable_authors: None,
906                language: None,
907                notes: None,
908                length_unique: 0,
909                high_length_unique: 0,
910                high_length: 0,
911                min_matched_length: 0,
912                min_high_matched_length: 0,
913                min_matched_length_unique: 0,
914                min_high_matched_length_unique: 0,
915                is_small: false,
916                is_tiny: false,
917                starts_with_license: false,
918                ends_with_license: false,
919                is_deprecated: false,
920                spdx_license_key: None,
921                other_spdx_license_keys: vec![],
922            },
923            Rule {
924                identifier: "apache-2.0.LICENSE".to_string(),
925                license_expression: "apache-2.0".to_string(),
926                text: "MIT License".to_string(),
927                tokens: vec![],
928                rule_kind: crate::license_detection::models::RuleKind::Text,
929                is_false_positive: false,
930                is_required_phrase: false,
931                is_from_license: false,
932                relevance: 100,
933                minimum_coverage: None,
934                has_stored_minimum_coverage: false,
935                is_continuous: false,
936                required_phrase_spans: vec![],
937                stopwords_by_pos: HashMap::new(),
938                referenced_filenames: None,
939                ignorable_urls: None,
940                ignorable_emails: None,
941                ignorable_copyrights: None,
942                ignorable_holders: None,
943                ignorable_authors: None,
944                language: None,
945                notes: None,
946                length_unique: 0,
947                high_length_unique: 0,
948                high_length: 0,
949                min_matched_length: 0,
950                min_high_matched_length: 0,
951                min_matched_length_unique: 0,
952                min_high_matched_length_unique: 0,
953                is_small: false,
954                is_tiny: false,
955                starts_with_license: false,
956                ends_with_license: false,
957                is_deprecated: false,
958                spdx_license_key: None,
959                other_spdx_license_keys: vec![],
960            },
961        ];
962
963        validate_rules(&rules);
964    }
965
966    #[test]
967    fn test_validate_rules_accepts_false_positive_without_expression() {
968        let rules = vec![Rule {
969            identifier: "fp.RULE".to_string(),
970            license_expression: "".to_string(),
971            text: "Some text".to_string(),
972            tokens: vec![],
973            rule_kind: crate::license_detection::models::RuleKind::None,
974            is_false_positive: true,
975            is_required_phrase: false,
976            is_from_license: false,
977            relevance: 100,
978            minimum_coverage: None,
979            has_stored_minimum_coverage: false,
980            is_continuous: false,
981            required_phrase_spans: vec![],
982            stopwords_by_pos: HashMap::new(),
983            referenced_filenames: None,
984            ignorable_urls: None,
985            ignorable_emails: None,
986            ignorable_copyrights: None,
987            ignorable_holders: None,
988            ignorable_authors: None,
989            language: None,
990            notes: Some("False positive for common pattern".to_string()),
991            length_unique: 0,
992            high_length_unique: 0,
993            high_length: 0,
994            min_matched_length: 0,
995            min_high_matched_length: 0,
996            min_matched_length_unique: 0,
997            min_high_matched_length_unique: 0,
998            is_small: false,
999            is_tiny: false,
1000            starts_with_license: false,
1001            ends_with_license: false,
1002            is_deprecated: false,
1003            spdx_license_key: None,
1004            other_spdx_license_keys: vec![],
1005        }];
1006
1007        validate_rules(&rules);
1008    }
1009
1010    #[test]
1011    fn test_validate_rules_no_duplicates() {
1012        let rules = vec![
1013            Rule {
1014                identifier: "mit.LICENSE".to_string(),
1015                license_expression: "mit".to_string(),
1016                text: "MIT License".to_string(),
1017                tokens: vec![],
1018                rule_kind: crate::license_detection::models::RuleKind::Text,
1019                is_false_positive: false,
1020                is_required_phrase: false,
1021                is_from_license: false,
1022                relevance: 100,
1023                minimum_coverage: None,
1024                has_stored_minimum_coverage: false,
1025                is_continuous: false,
1026                required_phrase_spans: vec![],
1027                stopwords_by_pos: HashMap::new(),
1028                referenced_filenames: None,
1029                ignorable_urls: None,
1030                ignorable_emails: None,
1031                ignorable_copyrights: None,
1032                ignorable_holders: None,
1033                ignorable_authors: None,
1034                language: None,
1035                notes: None,
1036                length_unique: 0,
1037                high_length_unique: 0,
1038                high_length: 0,
1039                min_matched_length: 0,
1040                min_high_matched_length: 0,
1041                min_matched_length_unique: 0,
1042                min_high_matched_length_unique: 0,
1043                is_small: false,
1044                is_tiny: false,
1045                starts_with_license: false,
1046                ends_with_license: false,
1047                is_deprecated: false,
1048                spdx_license_key: None,
1049                other_spdx_license_keys: vec![],
1050            },
1051            Rule {
1052                identifier: "apache-2.0.LICENSE".to_string(),
1053                license_expression: "apache-2.0".to_string(),
1054                text: "Apache License".to_string(),
1055                tokens: vec![],
1056                rule_kind: crate::license_detection::models::RuleKind::Text,
1057                is_false_positive: false,
1058                is_required_phrase: false,
1059                is_from_license: false,
1060                relevance: 100,
1061                minimum_coverage: None,
1062                has_stored_minimum_coverage: false,
1063                is_continuous: false,
1064                required_phrase_spans: vec![],
1065                stopwords_by_pos: HashMap::new(),
1066                referenced_filenames: None,
1067                ignorable_urls: None,
1068                ignorable_emails: None,
1069                ignorable_copyrights: None,
1070                ignorable_holders: None,
1071                ignorable_authors: None,
1072                language: None,
1073                notes: None,
1074                length_unique: 0,
1075                high_length_unique: 0,
1076                high_length: 0,
1077                min_matched_length: 0,
1078                min_high_matched_length: 0,
1079                min_matched_length_unique: 0,
1080                min_high_matched_length_unique: 0,
1081                is_small: false,
1082                is_tiny: false,
1083                starts_with_license: false,
1084                ends_with_license: false,
1085                is_deprecated: false,
1086                spdx_license_key: None,
1087                other_spdx_license_keys: vec![],
1088            },
1089        ];
1090
1091        validate_rules(&rules);
1092    }
1093
1094    #[test]
1095    fn test_load_licenses_filters_deprecated_by_default() {
1096        let dir = tempdir().unwrap();
1097
1098        fs::write(
1099            dir.path().join("active.LICENSE"),
1100            r#"---
1101key: active
1102name: Active License
1103---
1104Active license text"#,
1105        )
1106        .unwrap();
1107
1108        fs::write(
1109            dir.path().join("deprecated.LICENSE"),
1110            r#"---
1111key: deprecated
1112name: Deprecated License
1113is_deprecated: yes
1114---
1115Deprecated license text"#,
1116        )
1117        .unwrap();
1118
1119        let licenses_without = load_licenses_from_directory(dir.path(), false).unwrap();
1120        assert_eq!(licenses_without.len(), 1);
1121        assert_eq!(licenses_without[0].key, "active");
1122
1123        let licenses_with = load_licenses_from_directory(dir.path(), true).unwrap();
1124        assert_eq!(licenses_with.len(), 2);
1125    }
1126
1127    #[test]
1128    fn test_load_rules_filters_deprecated_by_default() {
1129        let dir = tempdir().unwrap();
1130
1131        fs::write(
1132            dir.path().join("active.RULE"),
1133            r#"---
1134license_expression: active
1135is_license_notice: yes
1136---
1137Active rule text"#,
1138        )
1139        .unwrap();
1140
1141        fs::write(
1142            dir.path().join("deprecated.RULE"),
1143            r#"---
1144license_expression: deprecated
1145is_license_notice: yes
1146is_deprecated: yes
1147---
1148Deprecated rule text"#,
1149        )
1150        .unwrap();
1151
1152        let rules_without = load_rules_from_directory(dir.path(), false).unwrap();
1153        assert_eq!(rules_without.len(), 1);
1154        assert_eq!(rules_without[0].license_expression, "active");
1155
1156        let rules_with = load_rules_from_directory(dir.path(), true).unwrap();
1157        assert_eq!(rules_with.len(), 2);
1158    }
1159
1160    #[test]
1161    fn test_parse_rule_to_loaded() {
1162        let dir = tempdir().unwrap();
1163        let rule_path = dir.path().join("mit_1.RULE");
1164        fs::write(
1165            &rule_path,
1166            r#"---
1167license_expression: mit
1168is_license_reference: yes
1169relevance: 90
1170referenced_filenames:
1171    - MIT.txt
1172---
1173MIT.txt"#,
1174        )
1175        .unwrap();
1176
1177        let loaded = parse_rule_to_loaded(&rule_path).unwrap();
1178        assert_eq!(loaded.identifier, "mit_1.RULE");
1179        assert_eq!(loaded.license_expression, "mit");
1180        assert_eq!(loaded.text, "MIT.txt");
1181        assert_eq!(
1182            loaded.rule_kind,
1183            crate::license_detection::models::RuleKind::Reference
1184        );
1185        assert_eq!(loaded.relevance, Some(90));
1186        assert_eq!(
1187            loaded.referenced_filenames,
1188            Some(vec!["MIT.txt".to_string()])
1189        );
1190        assert!(!loaded.is_deprecated);
1191    }
1192
1193    #[test]
1194    fn test_parse_license_to_loaded() {
1195        let dir = tempdir().unwrap();
1196        let license_path = dir.path().join("mit.LICENSE");
1197        fs::write(
1198            &license_path,
1199            r#"---
1200key: mit
1201short_name: MIT License
1202name: MIT License
1203category: Permissive
1204spdx_license_key: MIT
1205---
1206MIT License text here"#,
1207        )
1208        .unwrap();
1209
1210        let loaded = parse_license_to_loaded(&license_path).unwrap();
1211        assert_eq!(loaded.key, "mit");
1212        assert_eq!(loaded.name, "MIT License");
1213        assert!(loaded.text.contains("MIT License text"));
1214        assert_eq!(loaded.spdx_license_key, Some("MIT".to_string()));
1215    }
1216
1217    #[test]
1218    fn test_load_loaded_rules_from_directory_includes_deprecated() {
1219        let dir = tempdir().unwrap();
1220
1221        fs::write(
1222            dir.path().join("active.RULE"),
1223            r#"---
1224license_expression: active
1225is_license_notice: yes
1226---
1227Active rule text"#,
1228        )
1229        .unwrap();
1230
1231        fs::write(
1232            dir.path().join("deprecated.RULE"),
1233            r#"---
1234license_expression: deprecated
1235is_license_notice: yes
1236is_deprecated: yes
1237---
1238Deprecated rule text"#,
1239        )
1240        .unwrap();
1241
1242        let loaded_rules = load_loaded_rules_from_directory(dir.path()).unwrap();
1243        assert_eq!(loaded_rules.len(), 2);
1244
1245        let active = loaded_rules
1246            .iter()
1247            .find(|r| r.license_expression == "active")
1248            .unwrap();
1249        assert!(!active.is_deprecated);
1250
1251        let deprecated = loaded_rules
1252            .iter()
1253            .find(|r| r.license_expression == "deprecated")
1254            .unwrap();
1255        assert!(deprecated.is_deprecated);
1256    }
1257
1258    #[test]
1259    fn test_load_loaded_licenses_from_directory_includes_deprecated() {
1260        let dir = tempdir().unwrap();
1261
1262        fs::write(
1263            dir.path().join("active.LICENSE"),
1264            r#"---
1265key: active
1266name: Active License
1267---
1268Active license text"#,
1269        )
1270        .unwrap();
1271
1272        fs::write(
1273            dir.path().join("deprecated.LICENSE"),
1274            r#"---
1275key: deprecated
1276name: Deprecated License
1277is_deprecated: yes
1278---
1279Deprecated license text"#,
1280        )
1281        .unwrap();
1282
1283        let loaded_licenses = load_loaded_licenses_from_directory(dir.path()).unwrap();
1284        assert_eq!(loaded_licenses.len(), 2);
1285
1286        let active = loaded_licenses.iter().find(|l| l.key == "active").unwrap();
1287        assert!(!active.is_deprecated);
1288
1289        let deprecated = loaded_licenses
1290            .iter()
1291            .find(|l| l.key == "deprecated")
1292            .unwrap();
1293        assert!(deprecated.is_deprecated);
1294    }
1295}