Skip to main content

provenant/parsers/
requirements_txt.rs

1//! Parser for pip requirements.txt files.
2//!
3//! Extracts Python package dependencies from requirements.txt files using PEP 508
4//! specification parsing with support for includes, environment markers, and URLs.
5//!
6//! # Supported Formats
7//! - requirements.txt (pip dependency specification files)
8//! - Supports includes: `-r requirements.txt`, `-c constraints.txt`
9//! - Supports markers: `package; python_version >= '3.6'`
10//! - Supports VCS refs: `git+https://...`, `git+ssh://...`
11//!
//! # Key Features
//! - PEP 508 requirement parsing; environment markers are captured as-is, not evaluated
//! - Recursive file inclusion support (`-r` and `-c` directives)
//! - VCS/URL dependency detection and handling
//! - Package URL (purl) generation for PyPI packages
//! - Comment stripping and backslash line continuations
18//!
//! # Implementation Notes
//! - Uses the PEP 508 parser from the `pep508` module
//! - Recursively resolves included files relative to the including file
//! - Comment lines (starting with `#`) and inline comments (whitespace followed by `#`) are dropped
//! - Environment markers are stored under `markers` in each dependency's extra data
24
25use std::collections::{HashMap, HashSet};
26use std::fs;
27use std::path::{Path, PathBuf};
28
29use crate::parser_warn as warn;
30use packageurl::PackageUrl;
31use serde_json::Value as JsonValue;
32
33use crate::models::{DatasourceId, Dependency, PackageData, PackageType};
34use crate::parsers::pep508::{Pep508Requirement, parse_pep508_requirement};
35
36use super::PackageParser;
37
/// pip requirements.txt parser supporting PEP 508 dependency specifications.
///
/// Handles requirements.txt files with -r/-c includes, environment markers,
/// and VCS/URL references. Recursively resolves included requirement files.
///
/// This is a stateless marker type: all behaviour lives in its
/// `PackageParser` implementation and the free functions of this module.
pub struct RequirementsTxtParser;
43
44impl PackageParser for RequirementsTxtParser {
45    const PACKAGE_TYPE: PackageType = PackageType::Pypi;
46
47    fn extract_packages(path: &Path) -> Vec<PackageData> {
48        vec![extract_from_requirements_txt(path)]
49    }
50
51    fn is_match(path: &Path) -> bool {
52        let filename = path.file_name().and_then(|name| name.to_str());
53        let Some(name) = filename else {
54            return false;
55        };
56
57        is_requirements_txt_filename(name)
58            || (is_requirements_like_extension(name) && has_requirements_like_ancestor(path))
59    }
60}
61
62fn is_requirements_txt_filename(name: &str) -> bool {
63    if name == "requirements.txt" || name == "requires.txt" {
64        return true;
65    }
66
67    let (stem, extension) = if let Some(stem) = name.strip_suffix(".txt") {
68        (stem, "txt")
69    } else if let Some(stem) = name.strip_suffix(".in") {
70        (stem, "in")
71    } else {
72        return false;
73    };
74
75    // Keep parity with ScanCode's documented *reqs.txt support while avoiding
76    // extending that broader alias to .in files or unrelated stems such as
77    // `prereqs.txt` that only happen to end with the same letters.
78    stem == "requirements"
79        || stem.starts_with("requirements")
80        || stem.ends_with("requirements")
81        || (extension == "txt" && is_reqs_alias_stem(stem))
82}
83
/// Returns `true` when `stem` is exactly `reqs` or ends with `reqs` preceded
/// by one of the separators `-`, `_` or `.`.
fn is_reqs_alias_stem(stem: &str) -> bool {
    match stem.strip_suffix("reqs") {
        // Either the whole stem was "reqs", or a separator must precede it.
        Some(prefix) => prefix.is_empty() || prefix.ends_with(['-', '_', '.']),
        None => false,
    }
}
87
/// Returns `true` when `name` ends with `.txt` or `.in`.
fn is_requirements_like_extension(name: &str) -> bool {
    const EXTENSIONS: [&str; 2] = [".txt", ".in"];
    EXTENSIONS.iter().any(|extension| name.ends_with(*extension))
}
91
92fn has_requirements_like_ancestor(path: &Path) -> bool {
93    path.parent()
94        .into_iter()
95        .flat_map(Path::ancestors)
96        .filter_map(|ancestor| ancestor.file_name())
97        .filter_map(|name| name.to_str())
98        .any(is_requirements_like_dir_name)
99}
100
/// Returns `true` when the directory name starts or ends with `requirements`
/// (which includes the exact name `requirements`).
fn is_requirements_like_dir_name(name: &str) -> bool {
    const MARKER: &str = "requirements";
    name.starts_with(MARKER) || name.ends_with(MARKER)
}
104
/// Mutable accumulator shared across the root requirements file and every
/// file reached from it through `-r`/`-c` directives.
struct ParseState {
    /// Dependencies collected so far, in the order they were encountered.
    dependencies: Vec<Dependency>,
    /// Every `--extra-index-url` value seen, in order of appearance.
    extra_index_urls: Vec<String>,
    /// The most recent `--index-url` value seen (later values overwrite earlier ones).
    index_url: Option<String>,
    /// Raw path arguments of `-r`/`--requirement` directives, as written in the files.
    includes: Vec<String>,
    /// Raw path arguments of `-c`/`--constraint` directives, as written in the files.
    constraints: Vec<String>,
    /// Canonicalized paths of files already entered; used to stop repeated or
    /// circular includes.
    visited: HashSet<PathBuf>,
}
113
114fn extract_from_requirements_txt(path: &Path) -> PackageData {
115    let mut state = ParseState {
116        dependencies: Vec::new(),
117        extra_index_urls: Vec::new(),
118        index_url: None,
119        includes: Vec::new(),
120        constraints: Vec::new(),
121        visited: HashSet::new(),
122    };
123
124    let (scope, is_runtime) = scope_from_filename(path);
125
126    parse_requirements_with_includes(path, &mut state, &scope, is_runtime);
127
128    let mut extra_data = HashMap::new();
129    if let Some(url) = state.index_url {
130        extra_data.insert("index_url".to_string(), JsonValue::String(url));
131    }
132    if !state.extra_index_urls.is_empty() {
133        extra_data.insert(
134            "extra_index_urls".to_string(),
135            JsonValue::Array(
136                state
137                    .extra_index_urls
138                    .into_iter()
139                    .map(JsonValue::String)
140                    .collect(),
141            ),
142        );
143    }
144    if !state.includes.is_empty() {
145        extra_data.insert(
146            "requirements_includes".to_string(),
147            JsonValue::Array(state.includes.into_iter().map(JsonValue::String).collect()),
148        );
149    }
150    if !state.constraints.is_empty() {
151        extra_data.insert(
152            "constraints".to_string(),
153            JsonValue::Array(
154                state
155                    .constraints
156                    .into_iter()
157                    .map(JsonValue::String)
158                    .collect(),
159            ),
160        );
161    }
162
163    let extra_data = if extra_data.is_empty() {
164        None
165    } else {
166        Some(extra_data)
167    };
168
169    default_package_data(state.dependencies, extra_data)
170}
171
172fn parse_requirements_with_includes(
173    path: &Path,
174    state: &mut ParseState,
175    scope: &str,
176    is_runtime: bool,
177) {
178    let abs_path = match path.canonicalize() {
179        Ok(p) => p,
180        Err(_) => {
181            warn!("Cannot resolve path: {:?}", path);
182            return;
183        }
184    };
185
186    if state.visited.contains(&abs_path) {
187        warn!("Circular include detected: {:?}", path);
188        return;
189    }
190
191    state.visited.insert(abs_path.clone());
192
193    let content = match fs::read_to_string(&abs_path) {
194        Ok(c) => c,
195        Err(e) => {
196            warn!("Cannot read file {:?}: {}", abs_path, e);
197            return;
198        }
199    };
200
201    for line in collect_logical_lines(&content) {
202        let cleaned = strip_inline_comment(&line);
203        let trimmed = cleaned.trim();
204        if trimmed.is_empty() || trimmed.starts_with('#') {
205            continue;
206        }
207
208        if let Some(url) = parse_option_value(trimmed, "--extra-index-url") {
209            state.extra_index_urls.push(url);
210            continue;
211        }
212
213        if let Some(url) = parse_option_value(trimmed, "--index-url") {
214            state.index_url = Some(url);
215            continue;
216        }
217
218        if let Some(path_value) = parse_option_value(trimmed, "-r")
219            .or_else(|| parse_option_value(trimmed, "--requirement"))
220        {
221            state.includes.push(path_value.clone());
222            let included_path = abs_path
223                .parent()
224                .unwrap_or_else(|| Path::new("."))
225                .join(&path_value);
226
227            if included_path.exists() {
228                parse_requirements_with_includes(&included_path, state, scope, is_runtime);
229            } else {
230                warn!("Included file not found: {:?}", included_path);
231            }
232            continue;
233        }
234
235        if let Some(path_value) = parse_option_value(trimmed, "-c")
236            .or_else(|| parse_option_value(trimmed, "--constraint"))
237        {
238            state.constraints.push(path_value.clone());
239            let constraint_path = abs_path
240                .parent()
241                .unwrap_or_else(|| Path::new("."))
242                .join(&path_value);
243
244            if constraint_path.exists() {
245                parse_requirements_with_includes(&constraint_path, state, scope, is_runtime);
246            } else {
247                warn!("Constraint file not found: {:?}", constraint_path);
248            }
249            continue;
250        }
251
252        if trimmed.starts_with('-')
253            && !trimmed.starts_with("-e")
254            && !trimmed.starts_with("--editable")
255        {
256            continue;
257        }
258
259        if let Some(dependency) = build_dependency(trimmed, scope, is_runtime) {
260            state.dependencies.push(dependency);
261        }
262    }
263}
264
265fn default_package_data(
266    dependencies: Vec<Dependency>,
267    extra_data: Option<HashMap<String, JsonValue>>,
268) -> PackageData {
269    PackageData {
270        package_type: Some(RequirementsTxtParser::PACKAGE_TYPE),
271        primary_language: Some("Python".to_string()),
272        extra_data,
273        dependencies,
274        datasource_id: Some(DatasourceId::PipRequirements),
275        ..Default::default()
276    }
277}
278
/// Splits `content` into logical lines, joining physical lines that end with
/// a backslash continuation.
///
/// Each physical line is trimmed, joined fragments are separated by a single
/// space, and blank logical lines are dropped. A trailing continuation at the
/// end of the input still yields whatever was accumulated.
///
/// A line whose first non-blank character is `#` is a comment line and is
/// never treated as a continuation, even if it ends with a backslash. This
/// mirrors pip, which joins continued lines "except when following comments";
/// otherwise the requirement after such a comment would be absorbed into the
/// comment and lost.
fn collect_logical_lines(content: &str) -> Vec<String> {
    let mut lines = Vec::new();
    let mut current = String::new();

    for raw_line in content.lines() {
        // `lines()` removes "\n" and "\r\n"; this also drops any stray '\r'.
        let line = raw_line.trim_end_matches('\r');
        let trimmed = line.trim_end();

        let is_comment_line = line.trim_start().starts_with('#');
        let is_continuation = !is_comment_line && trimmed.ends_with('\\');

        let fragment = if is_continuation {
            trimmed.trim_end_matches('\\')
        } else {
            line
        }
        .trim();

        if !fragment.is_empty() {
            if !current.is_empty() {
                current.push(' ');
            }
            current.push_str(fragment);
        }

        if !is_continuation && !current.is_empty() {
            // `current` is built from trimmed fragments, so it needs no
            // further trimming; `take` leaves an empty buffer behind.
            lines.push(std::mem::take(&mut current));
        }
    }

    if !current.is_empty() {
        lines.push(current);
    }

    lines
}
312
/// Removes a trailing `#` comment from `line` and returns the remainder with
/// trailing whitespace trimmed.
///
/// A `#` only starts a comment when it is outside single/double quotes and is
/// either at the very beginning of the line or directly preceded by
/// whitespace, so URL fragments such as `...#egg=name` are left intact. When
/// no comment is found the line is returned unchanged.
fn strip_inline_comment(line: &str) -> String {
    // The quote character currently open, if any.
    let mut open_quote: Option<char> = None;

    for (position, ch) in line.char_indices() {
        match (ch, open_quote) {
            ('\'' | '"', None) => open_quote = Some(ch),
            ('\'' | '"', Some(quote)) if quote == ch => open_quote = None,
            ('#', None) => {
                let before = &line[..position];
                let starts_comment = before
                    .chars()
                    .next_back()
                    .map_or(true, char::is_whitespace);
                if starts_comment {
                    return before.trim_end().to_string();
                }
            }
            _ => {}
        }
    }

    line.to_string()
}
331
/// Extracts the argument of `option` from `line`.
///
/// Returns `None` when `line` does not start with `option` or when nothing
/// but whitespace (and an optional `=`) follows it. Otherwise returns the
/// trimmed text after the option, with one leading `=` removed.
fn parse_option_value(line: &str, option: &str) -> Option<String> {
    let after_option = line.strip_prefix(option)?.trim();
    let value = match after_option.strip_prefix('=') {
        Some(after_equals) => after_equals.trim(),
        None => after_option,
    };

    (!value.is_empty()).then(|| value.to_string())
}
344
/// Derives the dependency scope and runtime flag from the file name of `path`.
///
/// The lower-cased file name is searched for `dev`, `test` and `doc`, in that
/// order; the first hit selects the scope `develop`, `test` or `docs` with
/// the runtime flag `false`. Without a hit (or without a UTF-8 file name) the
/// result is `("install", true)`. Directory components are not considered.
fn scope_from_filename(path: &Path) -> (String, bool) {
    // (substring to look for, resulting scope), in priority order.
    const NON_RUNTIME_SCOPES: [(&str, &str); 3] =
        [("dev", "develop"), ("test", "test"), ("doc", "docs")];

    let lowered = path
        .file_name()
        .and_then(|os_name| os_name.to_str())
        .map(str::to_ascii_lowercase)
        .unwrap_or_default();

    match NON_RUNTIME_SCOPES
        .iter()
        .find(|(needle, _)| lowered.contains(*needle))
    {
        Some((_, scope)) => (scope.to_string(), false),
        None => ("install".to_string(), true),
    }
}
364
365fn build_dependency(line: &str, scope: &str, is_runtime: bool) -> Option<Dependency> {
366    let trimmed = line.trim();
367    if trimmed.is_empty() {
368        return None;
369    }
370
371    let mut is_editable = false;
372    let mut requirement = trimmed.to_string();
373    let mut extracted_requirement = trimmed.to_string();
374
375    if let Some(rest) = trimmed.strip_prefix("-e") {
376        is_editable = true;
377        requirement = rest.trim().to_string();
378        extracted_requirement = format!("--editable {}", requirement);
379    } else if let Some(rest) = trimmed.strip_prefix("--editable") {
380        is_editable = true;
381        requirement = rest.trim().to_string();
382        extracted_requirement = format!("--editable {}", requirement);
383    }
384
385    let (requirement, hash_options) = split_hash_options(&requirement);
386    let requirement = requirement.trim();
387    if requirement.is_empty() {
388        return None;
389    }
390
391    if looks_like_hash_only_requirement(requirement) {
392        return None;
393    }
394
395    let parsed = parse_requirement(requirement);
396
397    let pinned_version = parsed
398        .specifiers
399        .as_deref()
400        .and_then(extract_pinned_version);
401    let is_pinned = pinned_version.is_some();
402
403    let purl = parsed
404        .name
405        .as_ref()
406        .and_then(|name| create_pypi_purl(name, pinned_version.as_deref()));
407
408    let mut extra_data = HashMap::new();
409    extra_data.insert("is_editable".to_string(), JsonValue::Bool(is_editable));
410    extra_data.insert(
411        "link".to_string(),
412        parsed
413            .link
414            .clone()
415            .map(JsonValue::String)
416            .unwrap_or(JsonValue::Null),
417    );
418    extra_data.insert(
419        "hash_options".to_string(),
420        JsonValue::Array(hash_options.into_iter().map(JsonValue::String).collect()),
421    );
422    extra_data.insert("is_constraint".to_string(), JsonValue::Bool(false));
423    extra_data.insert(
424        "is_archive".to_string(),
425        parsed
426            .is_archive
427            .map(JsonValue::Bool)
428            .unwrap_or(JsonValue::Null),
429    );
430    extra_data.insert("is_wheel".to_string(), JsonValue::Bool(parsed.is_wheel));
431    extra_data.insert(
432        "is_url".to_string(),
433        parsed
434            .is_url
435            .map(JsonValue::Bool)
436            .unwrap_or(JsonValue::Null),
437    );
438    extra_data.insert(
439        "is_vcs_url".to_string(),
440        parsed
441            .is_vcs_url
442            .map(JsonValue::Bool)
443            .unwrap_or(JsonValue::Null),
444    );
445    extra_data.insert(
446        "is_name_at_url".to_string(),
447        JsonValue::Bool(parsed.is_name_at_url),
448    );
449    extra_data.insert(
450        "is_local_path".to_string(),
451        parsed
452            .is_local_path
453            .map(|value| value || is_editable)
454            .map(JsonValue::Bool)
455            .unwrap_or(JsonValue::Null),
456    );
457
458    if let Some(marker) = parsed.marker {
459        extra_data.insert("markers".to_string(), JsonValue::String(marker));
460    }
461
462    Some(Dependency {
463        purl,
464        extracted_requirement: Some(extracted_requirement),
465        scope: Some(scope.to_string()),
466        is_runtime: Some(is_runtime),
467        is_optional: Some(false),
468        is_pinned: Some(is_pinned),
469        is_direct: Some(true),
470        resolved_package: None,
471        extra_data: Some(extra_data),
472    })
473}
474
/// Returns `true` when the trimmed `requirement` consists solely of ASCII hex
/// digits and is 32, 40, 64, 96 or 128 characters long, i.e. it looks like a
/// bare digest rather than a package requirement.
///
/// Whitespace, brackets, `@`, `;`, slashes, `==`, `://` and `git+` all contain
/// at least one non-hex character, so the all-hex test alone rejects them.
fn looks_like_hash_only_requirement(requirement: &str) -> bool {
    // NOTE(review): these presumably correspond to common hex digest widths
    // (MD5, SHA-1, SHA-256, SHA-384, SHA-512) — confirm.
    const DIGEST_HEX_LENGTHS: [usize; 5] = [32, 40, 64, 96, 128];

    let candidate = requirement.trim();
    DIGEST_HEX_LENGTHS.contains(&candidate.len())
        && candidate.bytes().all(|byte| byte.is_ascii_hexdigit())
}
492
/// Separates `--hash=<value>` options from the rest of a requirement line.
///
/// Returns the remaining whitespace-separated tokens re-joined with single
/// spaces, plus the hash values in their original order. `--hash=` tokens
/// with an empty value are dropped from both outputs.
fn split_hash_options(input: &str) -> (String, Vec<String>) {
    const HASH_PREFIX: &str = "--hash=";

    let (hash_tokens, kept): (Vec<&str>, Vec<&str>) = input
        .split_whitespace()
        .partition(|token| token.starts_with(HASH_PREFIX));

    let hashes = hash_tokens
        .into_iter()
        .filter_map(|token| token.strip_prefix(HASH_PREFIX))
        .filter(|value| !value.is_empty())
        .map(str::to_string)
        .collect();

    (kept.join(" "), hashes)
}
509
/// Normalised view of a single requirement, produced by `parse_requirement`.
///
/// For plain name/specifier requirements the link-related `Option` fields are
/// `None`; for link requirements they are always populated.
struct ParsedRequirement {
    /// Normalised package name, when one could be determined.
    name: Option<String>,
    /// Version specifier text as returned by the PEP 508 parser.
    specifiers: Option<String>,
    /// Environment marker text as returned by the PEP 508 parser.
    marker: Option<String>,
    /// URL or path the requirement points at, if it is a link requirement.
    link: Option<String>,
    /// Whether the link has a URL scheme or a VCS prefix.
    is_url: Option<bool>,
    /// Whether the link starts with a VCS prefix such as `git+`.
    is_vcs_url: Option<bool>,
    /// Whether the link looks like a local path or `file:` reference.
    is_local_path: Option<bool>,
    /// Taken from the PEP 508 parser's `is_name_at_url` for requirements that
    /// carry a URL; `false` otherwise.
    is_name_at_url: bool,
    /// Whether the link ends in a known archive/wheel extension; `None` when
    /// that cannot be decided.
    is_archive: Option<bool>,
    /// Whether the link ends with `.whl`.
    is_wheel: bool,
}
522
523fn parse_requirement(input: &str) -> ParsedRequirement {
524    if let Some(parsed) = parse_pep508_requirement(input) {
525        if let Some(url) = parsed.url.clone() {
526            return parsed_with_link(parsed, &url);
527        }
528
529        if !is_link_like(input) {
530            let name = Some(normalize_pypi_name(&parsed.name));
531            return ParsedRequirement {
532                name,
533                specifiers: parsed.specifiers,
534                marker: parsed.marker,
535                link: None,
536                is_url: None,
537                is_vcs_url: None,
538                is_local_path: None,
539                is_name_at_url: false,
540                is_archive: None,
541                is_wheel: false,
542            };
543        }
544    }
545
546    if let Some((name, link)) = parse_link_with_name(input) {
547        let normalized_name = normalize_pypi_name(&name);
548        let link_info = parse_link_flags(&link);
549        return ParsedRequirement {
550            name: Some(normalized_name),
551            specifiers: None,
552            marker: None,
553            link: Some(link),
554            is_url: Some(link_info.is_url),
555            is_vcs_url: Some(link_info.is_vcs_url),
556            is_local_path: Some(link_info.is_local_path),
557            is_name_at_url: link_info.is_name_at_url,
558            is_archive: link_info.is_archive,
559            is_wheel: link_info.is_wheel,
560        };
561    }
562
563    let link_info = parse_link_flags(input);
564    ParsedRequirement {
565        name: None,
566        specifiers: None,
567        marker: None,
568        link: Some(input.to_string()),
569        is_url: Some(link_info.is_url),
570        is_vcs_url: Some(link_info.is_vcs_url),
571        is_local_path: Some(link_info.is_local_path),
572        is_name_at_url: link_info.is_name_at_url,
573        is_archive: link_info.is_archive,
574        is_wheel: link_info.is_wheel,
575    }
576}
577
578fn parsed_with_link(parsed: Pep508Requirement, link: &str) -> ParsedRequirement {
579    let name = normalize_pypi_name(&parsed.name);
580    let link_info = parse_link_flags(link);
581    ParsedRequirement {
582        name: Some(name),
583        specifiers: parsed.specifiers,
584        marker: parsed.marker,
585        link: Some(link.to_string()),
586        is_url: Some(link_info.is_url),
587        is_vcs_url: Some(link_info.is_vcs_url),
588        is_local_path: Some(link_info.is_local_path),
589        is_name_at_url: parsed.is_name_at_url,
590        is_archive: link_info.is_archive,
591        is_wheel: link_info.is_wheel,
592    }
593}
594
595fn parse_link_with_name(input: &str) -> Option<(String, String)> {
596    if let Some(egg) = extract_egg_name(input) {
597        return Some((egg, input.to_string()));
598    }
599    None
600}
601
602fn extract_egg_name(input: &str) -> Option<String> {
603    let fragment = input.split('#').nth(1)?;
604    let egg_part = fragment.strip_prefix("egg=")?;
605    let name_part = egg_part.split('&').next()?.trim();
606    if name_part.is_empty() {
607        return None;
608    }
609    let (name, _extras, _) = parse_pep508_requirement(name_part)
610        .map(|parsed| (parsed.name, parsed.extras, parsed.specifiers))
611        .unwrap_or_else(|| (name_part.to_string(), Vec::new(), None));
612    Some(name)
613}
614
/// Classification of a URL or path, as computed by `parse_link_flags`.
struct LinkFlags {
    /// The link contains `://`, starts with `file:`, or has a VCS prefix.
    is_url: bool,
    /// The link starts with `git+`, `hg+`, `svn+` or `bzr+`.
    is_vcs_url: bool,
    /// The link starts with `./`, `../`, `/`, `~` or `file:`.
    is_local_path: bool,
    /// Always `false` when produced by `parse_link_flags`.
    is_name_at_url: bool,
    /// `Some(true)` for a known archive/wheel extension, `Some(false)` for
    /// other URLs and local paths, `None` when neither applies.
    is_archive: Option<bool>,
    /// The link ends with `.whl`.
    is_wheel: bool,
}
623
624fn parse_link_flags(link: &str) -> LinkFlags {
625    let trimmed = link.trim();
626    let is_vcs_url = trimmed.starts_with("git+")
627        || trimmed.starts_with("hg+")
628        || trimmed.starts_with("svn+")
629        || trimmed.starts_with("bzr+");
630    let has_scheme = trimmed.contains("://") || trimmed.starts_with("file:");
631    let is_local_path = trimmed.starts_with("./")
632        || trimmed.starts_with("../")
633        || trimmed.starts_with('/')
634        || trimmed.starts_with('~')
635        || trimmed.starts_with("file:");
636
637    let is_wheel = trimmed.ends_with(".whl");
638    let is_archive = if is_wheel
639        || trimmed.ends_with(".zip")
640        || trimmed.ends_with(".tar.gz")
641        || trimmed.ends_with(".tgz")
642        || trimmed.ends_with(".tar.bz2")
643        || trimmed.ends_with(".tar")
644    {
645        Some(true)
646    } else if has_scheme || is_local_path {
647        Some(false)
648    } else {
649        None
650    };
651
652    LinkFlags {
653        is_url: has_scheme || is_vcs_url,
654        is_vcs_url,
655        is_local_path,
656        is_name_at_url: false,
657        is_archive,
658        is_wheel,
659    }
660}
661
/// Returns `true` when the trimmed `input` looks like a URL or filesystem
/// path rather than a named requirement: it contains `://` anywhere, or
/// starts with a VCS prefix (`git+`, `hg+`, `svn+`, `bzr+`), `file:`, `./`,
/// `../`, `/` or `~`.
fn is_link_like(input: &str) -> bool {
    const LINK_PREFIXES: [&str; 9] = [
        "git+", "hg+", "svn+", "bzr+", "file:", "./", "../", "/", "~",
    ];

    let candidate = input.trim();
    candidate.contains("://")
        || LINK_PREFIXES
            .iter()
            .any(|prefix| candidate.starts_with(*prefix))
}
675
/// Returns the exact version when `specifiers` pins a single one.
///
/// A specifier pins a version when it is a lone `==<version>` or
/// `===<version>` clause. Anything containing a comma (several clauses), any
/// other operator, or an empty version yields `None`. The returned version is
/// trimmed of surrounding whitespace.
fn extract_pinned_version(specifiers: &str) -> Option<String> {
    let specifier = specifiers.trim();
    if specifier.contains(',') {
        return None;
    }

    // `===` has to be tried first: `==` is a prefix of it, so testing `==`
    // first would leave a stray `=` at the front of the version.
    let version = specifier
        .strip_prefix("===")
        .or_else(|| specifier.strip_prefix("=="))?
        .trim();

    if version.is_empty() {
        None
    } else {
        Some(version.to_string())
    }
}
697
698fn create_pypi_purl(name: &str, version: Option<&str>) -> Option<String> {
699    PackageUrl::new(RequirementsTxtParser::PACKAGE_TYPE.as_str(), name)
700        .ok()
701        .map(|_| match version {
702            Some(version) => format!("pkg:pypi/{name}@{}", encode_pypi_purl_version(version)),
703            None => format!("pkg:pypi/{name}"),
704        })
705}
706
/// Percent-encodes every `*` in `version` as `%2A`; all other characters are
/// copied through unchanged.
fn encode_pypi_purl_version(version: &str) -> String {
    version.split('*').collect::<Vec<_>>().join("%2A")
}
710
/// Normalises a project name: surrounding whitespace is removed, ASCII
/// letters are lower-cased, and every run of `-`, `_` and `.` collapses into
/// a single `-`.
fn normalize_pypi_name(name: &str) -> String {
    let trimmed = name.trim();
    let mut normalized = String::with_capacity(trimmed.len());

    for ch in trimmed.chars() {
        if matches!(ch, '-' | '_' | '.') {
            // Only separators ever write '-', so a trailing '-' means the
            // previous character was already a separator.
            if !normalized.ends_with('-') {
                normalized.push('-');
            }
        } else {
            normalized.push(ch.to_ascii_lowercase());
        }
    }

    normalized
}
729
// Registration metadata for this parser: description, file glob patterns,
// package type, primary language and a documentation URL.
// NOTE(review): what `register_parser!` does with these values is defined
// elsewhere in the crate — confirm before relying on the globs for matching.
// The patterns appear to parallel `RequirementsTxtParser::is_match`, except
// that `is_match` also accepts ancestor directories whose name merely *ends*
// with `requirements`, for which no glob is listed here.
crate::register_parser!(
    "pip requirements file",
    &[
        "**/requirements*.txt",
        "**/*requirements.txt",
        "**/reqs.txt",
        "**/*-reqs.txt",
        "**/*_reqs.txt",
        "**/*.reqs.txt",
        "**/requirements*.in",
        "**/*requirements.in",
        "**/requires.txt",
        "**/requirements/*.txt",
        "**/requirements/*.in",
        "**/requirements/**/*.txt",
        "**/requirements/**/*.in",
        "**/requirements*/*.txt",
        "**/requirements*/*.in",
        "**/requirements*/**/*.txt",
        "**/requirements*/**/*.in"
    ],
    "pypi",
    "Python",
    Some("https://pip.pypa.io/en/latest/reference/requirements-file-format/"),
);