Skip to main content

provenant/parsers/
requirements_txt.rs

1//! Parser for pip requirements.txt files.
2//!
3//! Extracts Python package dependencies from requirements.txt files using PEP 508
4//! specification parsing with support for includes, environment markers, and URLs.
5//!
6//! # Supported Formats
7//! - requirements.txt (pip dependency specification files)
8//! - Supports includes: `-r requirements.txt`, `-c constraints.txt`
9//! - Supports markers: `package; python_version >= '3.6'`
10//! - Supports VCS refs: `git+https://...`, `git+ssh://...`
11//!
12//! # Key Features
//! - PEP 508 requirement parsing; environment markers are captured verbatim, not evaluated
14//! - Recursive file inclusion support (`-r` and `-c` directives)
15//! - VCS/URL dependency detection and handling
16//! - Package URL (purl) generation for PyPI packages
17//! - Line comment handling and continuation lines
18//!
19//! # Implementation Notes
20//! - Uses PEP 508 parser from `pep508` module
21//! - Recursively resolves included files relative to parent file
22//! - Comments (lines starting with `#`) are skipped
23//! - Environment markers are preserved for dependency filtering
24
25use std::collections::{HashMap, HashSet};
26use std::fs;
27use std::path::{Path, PathBuf};
28
29use crate::parser_warn as warn;
30use packageurl::PackageUrl;
31use serde_json::Value as JsonValue;
32
33use crate::models::{DatasourceId, Dependency, PackageData, PackageType};
34use crate::parsers::pep508::{Pep508Requirement, parse_pep508_requirement};
35
36use super::PackageParser;
37
/// Parser for pip `requirements.txt`-style files.
///
/// Each requirement line is parsed as a PEP 508 specification (or as a
/// VCS/URL/path link). `-r`/`-c` includes are followed recursively, and the
/// dependencies found in included files are merged into the same result.
pub struct RequirementsTxtParser;
43
44impl PackageParser for RequirementsTxtParser {
45    const PACKAGE_TYPE: PackageType = PackageType::Pypi;
46
47    fn extract_packages(path: &Path) -> Vec<PackageData> {
48        vec![extract_from_requirements_txt(path)]
49    }
50
51    fn is_match(path: &Path) -> bool {
52        let filename = path.file_name().and_then(|name| name.to_str());
53        let Some(name) = filename else {
54            return false;
55        };
56
57        is_requirements_txt_filename(name)
58            || (is_requirements_like_extension(name) && has_requirements_like_ancestor(path))
59    }
60}
61
/// Returns `true` when the bare file name identifies a requirements file.
///
/// Accepts `requires.txt`, and any `.txt`/`.in` file whose stem starts or ends
/// with `requirements` (for example `requirements-dev.txt` or
/// `dev-requirements.in`).
fn is_requirements_txt_filename(name: &str) -> bool {
    if name == "requires.txt" {
        return true;
    }

    // `.txt` is tried first; `.in` is only considered when `.txt` does not match.
    match name
        .strip_suffix(".txt")
        .or_else(|| name.strip_suffix(".in"))
    {
        Some(stem) => stem.starts_with("requirements") || stem.ends_with("requirements"),
        None => false,
    }
}
76
/// Returns `true` when the file name ends in `.txt` or `.in`.
fn is_requirements_like_extension(name: &str) -> bool {
    [".txt", ".in"]
        .iter()
        .any(|extension| name.ends_with(*extension))
}
80
81fn has_requirements_like_ancestor(path: &Path) -> bool {
82    path.parent()
83        .into_iter()
84        .flat_map(Path::ancestors)
85        .filter_map(|ancestor| ancestor.file_name())
86        .filter_map(|name| name.to_str())
87        .any(is_requirements_like_dir_name)
88}
89
/// Returns `true` when a directory name starts or ends with `requirements`.
fn is_requirements_like_dir_name(name: &str) -> bool {
    const MARKER: &str = "requirements";
    name.starts_with(MARKER) || name.ends_with(MARKER)
}
93
94struct ParseState {
95    dependencies: Vec<Dependency>,
96    extra_index_urls: Vec<String>,
97    index_url: Option<String>,
98    includes: Vec<String>,
99    constraints: Vec<String>,
100    visited: HashSet<PathBuf>,
101}
102
103fn extract_from_requirements_txt(path: &Path) -> PackageData {
104    let mut state = ParseState {
105        dependencies: Vec::new(),
106        extra_index_urls: Vec::new(),
107        index_url: None,
108        includes: Vec::new(),
109        constraints: Vec::new(),
110        visited: HashSet::new(),
111    };
112
113    let (scope, is_runtime) = scope_from_filename(path);
114
115    parse_requirements_with_includes(path, &mut state, &scope, is_runtime);
116
117    let mut extra_data = HashMap::new();
118    if let Some(url) = state.index_url {
119        extra_data.insert("index_url".to_string(), JsonValue::String(url));
120    }
121    if !state.extra_index_urls.is_empty() {
122        extra_data.insert(
123            "extra_index_urls".to_string(),
124            JsonValue::Array(
125                state
126                    .extra_index_urls
127                    .into_iter()
128                    .map(JsonValue::String)
129                    .collect(),
130            ),
131        );
132    }
133    if !state.includes.is_empty() {
134        extra_data.insert(
135            "requirements_includes".to_string(),
136            JsonValue::Array(state.includes.into_iter().map(JsonValue::String).collect()),
137        );
138    }
139    if !state.constraints.is_empty() {
140        extra_data.insert(
141            "constraints".to_string(),
142            JsonValue::Array(
143                state
144                    .constraints
145                    .into_iter()
146                    .map(JsonValue::String)
147                    .collect(),
148            ),
149        );
150    }
151
152    let extra_data = if extra_data.is_empty() {
153        None
154    } else {
155        Some(extra_data)
156    };
157
158    default_package_data(state.dependencies, extra_data)
159}
160
161fn parse_requirements_with_includes(
162    path: &Path,
163    state: &mut ParseState,
164    scope: &str,
165    is_runtime: bool,
166) {
167    let abs_path = match path.canonicalize() {
168        Ok(p) => p,
169        Err(_) => {
170            warn!("Cannot resolve path: {:?}", path);
171            return;
172        }
173    };
174
175    if state.visited.contains(&abs_path) {
176        warn!("Circular include detected: {:?}", path);
177        return;
178    }
179
180    state.visited.insert(abs_path.clone());
181
182    let content = match fs::read_to_string(&abs_path) {
183        Ok(c) => c,
184        Err(e) => {
185            warn!("Cannot read file {:?}: {}", abs_path, e);
186            return;
187        }
188    };
189
190    for line in collect_logical_lines(&content) {
191        let cleaned = strip_inline_comment(&line);
192        let trimmed = cleaned.trim();
193        if trimmed.is_empty() || trimmed.starts_with('#') {
194            continue;
195        }
196
197        if let Some(url) = parse_option_value(trimmed, "--extra-index-url") {
198            state.extra_index_urls.push(url);
199            continue;
200        }
201
202        if let Some(url) = parse_option_value(trimmed, "--index-url") {
203            state.index_url = Some(url);
204            continue;
205        }
206
207        if let Some(path_value) = parse_option_value(trimmed, "-r")
208            .or_else(|| parse_option_value(trimmed, "--requirement"))
209        {
210            state.includes.push(path_value.clone());
211            let included_path = abs_path
212                .parent()
213                .unwrap_or_else(|| Path::new("."))
214                .join(&path_value);
215
216            if included_path.exists() {
217                parse_requirements_with_includes(&included_path, state, scope, is_runtime);
218            } else {
219                warn!("Included file not found: {:?}", included_path);
220            }
221            continue;
222        }
223
224        if let Some(path_value) = parse_option_value(trimmed, "-c")
225            .or_else(|| parse_option_value(trimmed, "--constraint"))
226        {
227            state.constraints.push(path_value.clone());
228            let constraint_path = abs_path
229                .parent()
230                .unwrap_or_else(|| Path::new("."))
231                .join(&path_value);
232
233            if constraint_path.exists() {
234                parse_requirements_with_includes(&constraint_path, state, scope, is_runtime);
235            } else {
236                warn!("Constraint file not found: {:?}", constraint_path);
237            }
238            continue;
239        }
240
241        if trimmed.starts_with('-')
242            && !trimmed.starts_with("-e")
243            && !trimmed.starts_with("--editable")
244        {
245            continue;
246        }
247
248        if let Some(dependency) = build_dependency(trimmed, scope, is_runtime) {
249            state.dependencies.push(dependency);
250        }
251    }
252}
253
254fn default_package_data(
255    dependencies: Vec<Dependency>,
256    extra_data: Option<HashMap<String, JsonValue>>,
257) -> PackageData {
258    PackageData {
259        package_type: Some(RequirementsTxtParser::PACKAGE_TYPE),
260        primary_language: Some("Python".to_string()),
261        extra_data,
262        dependencies,
263        datasource_id: Some(DatasourceId::PipRequirements),
264        ..Default::default()
265    }
266}
267
/// Joins backslash-continued physical lines into logical lines.
///
/// A physical line whose last non-whitespace character is `\` continues onto
/// the next one; the pieces are trimmed and joined with a single space. Blank
/// lines produce no output. A trailing continuation at end of input is flushed
/// as its own logical line.
fn collect_logical_lines(content: &str) -> Vec<String> {
    let mut logical_lines = Vec::new();
    let mut pending = String::new();

    for raw in content.lines() {
        // `trim_end` also removes a stray carriage return.
        let body = raw.trim_end();
        let continues = body.ends_with('\\');
        let fragment = if continues {
            body.trim_end_matches('\\').trim()
        } else {
            body.trim_start()
        };

        if !fragment.is_empty() {
            if !pending.is_empty() {
                pending.push(' ');
            }
            pending.push_str(fragment);
        }

        if !continues && !pending.is_empty() {
            // `pending` is built from trimmed fragments, so it needs no further trimming.
            logical_lines.push(std::mem::take(&mut pending));
        }
    }

    if !pending.is_empty() {
        logical_lines.push(pending);
    }

    logical_lines
}
301
/// Removes a trailing `#` comment from a logical line.
///
/// A `#` starts a comment only when it sits outside single/double quotes and
/// is either at the start of the line or preceded by whitespace, so URL
/// fragments such as `...#egg=name` are left intact. The returned text has
/// trailing whitespace removed when a comment was stripped; otherwise the
/// line is returned unchanged.
fn strip_inline_comment(line: &str) -> String {
    // The quote character currently open, if any.
    let mut open_quote: Option<char> = None;

    for (pos, ch) in line.char_indices() {
        match (open_quote, ch) {
            (None, '\'') | (None, '"') => open_quote = Some(ch),
            (Some(quote), _) if quote == ch => open_quote = None,
            (None, '#') => {
                let before = &line[..pos];
                if before.is_empty() || before.ends_with(char::is_whitespace) {
                    return before.trim_end().to_owned();
                }
            }
            _ => {}
        }
    }

    line.to_owned()
}
320
/// Extracts the value of `option` from a line that starts with it.
///
/// Accepts `OPTION value`, `OPTION=value` and `OPTIONvalue` spellings. Returns
/// `None` when the line does not start with `option` or the value is empty.
fn parse_option_value(line: &str, option: &str) -> Option<String> {
    let after_option = line.strip_prefix(option)?.trim();
    let value = match after_option.strip_prefix('=') {
        Some(after_equals) => after_equals.trim(),
        None => after_option,
    };

    if value.is_empty() {
        return None;
    }
    Some(value.to_owned())
}
333
/// Derives the dependency scope and runtime flag from the file name.
///
/// The lowercased file name is checked for `dev`, `test` and `doc`, in that
/// order; the first hit yields the `develop`, `test` or `docs` scope with
/// `is_runtime == false`. Anything else is an `install` scope with
/// `is_runtime == true`.
fn scope_from_filename(path: &Path) -> (String, bool) {
    // Order matters: the first matching keyword wins.
    const KEYWORD_SCOPES: [(&str, &str); 3] =
        [("dev", "develop"), ("test", "test"), ("doc", "docs")];

    let lowered = path
        .file_name()
        .and_then(|name| name.to_str())
        .unwrap_or_default()
        .to_ascii_lowercase();

    let matched = KEYWORD_SCOPES
        .iter()
        .find(|(keyword, _)| lowered.contains(*keyword));

    match matched {
        Some((_, scope)) => (scope.to_string(), false),
        None => ("install".to_string(), true),
    }
}
353
354fn build_dependency(line: &str, scope: &str, is_runtime: bool) -> Option<Dependency> {
355    let trimmed = line.trim();
356    if trimmed.is_empty() {
357        return None;
358    }
359
360    let mut is_editable = false;
361    let mut requirement = trimmed.to_string();
362    let mut extracted_requirement = trimmed.to_string();
363
364    if let Some(rest) = trimmed.strip_prefix("-e") {
365        is_editable = true;
366        requirement = rest.trim().to_string();
367        extracted_requirement = format!("--editable {}", requirement);
368    } else if let Some(rest) = trimmed.strip_prefix("--editable") {
369        is_editable = true;
370        requirement = rest.trim().to_string();
371        extracted_requirement = format!("--editable {}", requirement);
372    }
373
374    let (requirement, hash_options) = split_hash_options(&requirement);
375    let requirement = requirement.trim();
376    if requirement.is_empty() {
377        return None;
378    }
379
380    if looks_like_hash_only_requirement(requirement) {
381        return None;
382    }
383
384    let parsed = parse_requirement(requirement);
385
386    let pinned_version = parsed
387        .specifiers
388        .as_deref()
389        .and_then(extract_pinned_version);
390    let is_pinned = pinned_version.is_some();
391
392    let purl = parsed
393        .name
394        .as_ref()
395        .and_then(|name| create_pypi_purl(name, pinned_version.as_deref()));
396
397    let mut extra_data = HashMap::new();
398    extra_data.insert("is_editable".to_string(), JsonValue::Bool(is_editable));
399    extra_data.insert(
400        "link".to_string(),
401        parsed
402            .link
403            .clone()
404            .map(JsonValue::String)
405            .unwrap_or(JsonValue::Null),
406    );
407    extra_data.insert(
408        "hash_options".to_string(),
409        JsonValue::Array(hash_options.into_iter().map(JsonValue::String).collect()),
410    );
411    extra_data.insert("is_constraint".to_string(), JsonValue::Bool(false));
412    extra_data.insert(
413        "is_archive".to_string(),
414        parsed
415            .is_archive
416            .map(JsonValue::Bool)
417            .unwrap_or(JsonValue::Null),
418    );
419    extra_data.insert("is_wheel".to_string(), JsonValue::Bool(parsed.is_wheel));
420    extra_data.insert(
421        "is_url".to_string(),
422        parsed
423            .is_url
424            .map(JsonValue::Bool)
425            .unwrap_or(JsonValue::Null),
426    );
427    extra_data.insert(
428        "is_vcs_url".to_string(),
429        parsed
430            .is_vcs_url
431            .map(JsonValue::Bool)
432            .unwrap_or(JsonValue::Null),
433    );
434    extra_data.insert(
435        "is_name_at_url".to_string(),
436        JsonValue::Bool(parsed.is_name_at_url),
437    );
438    extra_data.insert(
439        "is_local_path".to_string(),
440        parsed
441            .is_local_path
442            .map(|value| value || is_editable)
443            .map(JsonValue::Bool)
444            .unwrap_or(JsonValue::Null),
445    );
446
447    if let Some(marker) = parsed.marker {
448        extra_data.insert("markers".to_string(), JsonValue::String(marker));
449    }
450
451    Some(Dependency {
452        purl,
453        extracted_requirement: Some(extracted_requirement),
454        scope: Some(scope.to_string()),
455        is_runtime: Some(is_runtime),
456        is_optional: Some(false),
457        is_pinned: Some(is_pinned),
458        is_direct: Some(true),
459        resolved_package: None,
460        extra_data: Some(extra_data),
461    })
462}
463
/// Returns `true` when the text is nothing but a hex digest.
///
/// The trimmed text must be 32, 40, 64, 96 or 128 characters long (presumably
/// the hex lengths of MD5, SHA-1, SHA-256, SHA-384 and SHA-512 digests) and
/// consist solely of ASCII hex digits. Requirement syntax such as `==`, `@`,
/// `/` or whitespace can never pass the hex-digit test, so no separate check
/// for it is needed.
fn looks_like_hash_only_requirement(requirement: &str) -> bool {
    let candidate = requirement.trim();
    let has_digest_length = matches!(candidate.len(), 32 | 40 | 64 | 96 | 128);

    has_digest_length && candidate.bytes().all(|byte| byte.is_ascii_hexdigit())
}
481
/// Separates `--hash=VALUE` tokens from the rest of a requirement line.
///
/// Returns the remaining whitespace-separated tokens re-joined with single
/// spaces, plus the hash values in order of appearance. A `--hash=` token with
/// an empty value is dropped from both outputs.
fn split_hash_options(input: &str) -> (String, Vec<String>) {
    let mut remaining: Vec<&str> = Vec::new();
    let mut hash_values: Vec<String> = Vec::new();

    for token in input.split_whitespace() {
        match token.strip_prefix("--hash=") {
            Some("") => {}
            Some(digest) => hash_values.push(digest.to_owned()),
            None => remaining.push(token),
        }
    }

    (remaining.join(" "), hash_values)
}
498
/// Normalized view of one requirement, whether it is a named PEP 508
/// requirement or a bare link.
///
/// The `Option<bool>` flags are `None` for plain named requirements that carry
/// no link, and `Some(..)` whenever a link was classified.
struct ParsedRequirement {
    /// Normalized package name, when one could be determined.
    name: Option<String>,
    /// Version specifier text as produced by the PEP 508 parser.
    specifiers: Option<String>,
    /// Environment marker text as produced by the PEP 508 parser.
    marker: Option<String>,
    /// URL or path the requirement points at, if any.
    link: Option<String>,
    /// Whether the link has a URL scheme or a VCS prefix.
    is_url: Option<bool>,
    /// Whether the link starts with a VCS prefix such as `git+`.
    is_vcs_url: Option<bool>,
    /// Whether the link looks like a filesystem path or `file:` URL.
    is_local_path: Option<bool>,
    /// Whether the requirement used the `name @ url` form.
    is_name_at_url: bool,
    /// Whether the link ends in a known archive extension.
    is_archive: Option<bool>,
    /// Whether the link ends in `.whl`.
    is_wheel: bool,
}
511
512fn parse_requirement(input: &str) -> ParsedRequirement {
513    if let Some(parsed) = parse_pep508_requirement(input) {
514        if let Some(url) = parsed.url.clone() {
515            return parsed_with_link(parsed, &url);
516        }
517
518        if !is_link_like(input) {
519            let name = Some(normalize_pypi_name(&parsed.name));
520            return ParsedRequirement {
521                name,
522                specifiers: parsed.specifiers,
523                marker: parsed.marker,
524                link: None,
525                is_url: None,
526                is_vcs_url: None,
527                is_local_path: None,
528                is_name_at_url: false,
529                is_archive: None,
530                is_wheel: false,
531            };
532        }
533    }
534
535    if let Some((name, link)) = parse_link_with_name(input) {
536        let normalized_name = normalize_pypi_name(&name);
537        let link_info = parse_link_flags(&link);
538        return ParsedRequirement {
539            name: Some(normalized_name),
540            specifiers: None,
541            marker: None,
542            link: Some(link),
543            is_url: Some(link_info.is_url),
544            is_vcs_url: Some(link_info.is_vcs_url),
545            is_local_path: Some(link_info.is_local_path),
546            is_name_at_url: link_info.is_name_at_url,
547            is_archive: link_info.is_archive,
548            is_wheel: link_info.is_wheel,
549        };
550    }
551
552    let link_info = parse_link_flags(input);
553    ParsedRequirement {
554        name: None,
555        specifiers: None,
556        marker: None,
557        link: Some(input.to_string()),
558        is_url: Some(link_info.is_url),
559        is_vcs_url: Some(link_info.is_vcs_url),
560        is_local_path: Some(link_info.is_local_path),
561        is_name_at_url: link_info.is_name_at_url,
562        is_archive: link_info.is_archive,
563        is_wheel: link_info.is_wheel,
564    }
565}
566
567fn parsed_with_link(parsed: Pep508Requirement, link: &str) -> ParsedRequirement {
568    let name = normalize_pypi_name(&parsed.name);
569    let link_info = parse_link_flags(link);
570    ParsedRequirement {
571        name: Some(name),
572        specifiers: parsed.specifiers,
573        marker: parsed.marker,
574        link: Some(link.to_string()),
575        is_url: Some(link_info.is_url),
576        is_vcs_url: Some(link_info.is_vcs_url),
577        is_local_path: Some(link_info.is_local_path),
578        is_name_at_url: parsed.is_name_at_url,
579        is_archive: link_info.is_archive,
580        is_wheel: link_info.is_wheel,
581    }
582}
583
584fn parse_link_with_name(input: &str) -> Option<(String, String)> {
585    if let Some(egg) = extract_egg_name(input) {
586        return Some((egg, input.to_string()));
587    }
588    None
589}
590
591fn extract_egg_name(input: &str) -> Option<String> {
592    let fragment = input.split('#').nth(1)?;
593    let egg_part = fragment.strip_prefix("egg=")?;
594    let name_part = egg_part.split('&').next()?.trim();
595    if name_part.is_empty() {
596        return None;
597    }
598    let (name, _extras, _) = parse_pep508_requirement(name_part)
599        .map(|parsed| (parsed.name, parsed.extras, parsed.specifiers))
600        .unwrap_or_else(|| (name_part.to_string(), Vec::new(), None));
601    Some(name)
602}
603
/// Classification of a link string, derived purely from its text.
struct LinkFlags {
    /// The link has a URL scheme (`://` or `file:`) or a VCS prefix.
    is_url: bool,
    /// The link starts with `git+`, `hg+`, `svn+` or `bzr+`.
    is_vcs_url: bool,
    /// The link starts with `./`, `../`, `/`, `~` or `file:`.
    is_local_path: bool,
    /// The requirement used the `name @ url` form.
    is_name_at_url: bool,
    /// `Some(true)` for known archive extensions, `Some(false)` for other
    /// URLs/paths, `None` when the text is neither.
    is_archive: Option<bool>,
    /// The link ends in `.whl`.
    is_wheel: bool,
}
612
613fn parse_link_flags(link: &str) -> LinkFlags {
614    let trimmed = link.trim();
615    let is_vcs_url = trimmed.starts_with("git+")
616        || trimmed.starts_with("hg+")
617        || trimmed.starts_with("svn+")
618        || trimmed.starts_with("bzr+");
619    let has_scheme = trimmed.contains("://") || trimmed.starts_with("file:");
620    let is_local_path = trimmed.starts_with("./")
621        || trimmed.starts_with("../")
622        || trimmed.starts_with('/')
623        || trimmed.starts_with('~')
624        || trimmed.starts_with("file:");
625
626    let is_wheel = trimmed.ends_with(".whl");
627    let is_archive = if is_wheel
628        || trimmed.ends_with(".zip")
629        || trimmed.ends_with(".tar.gz")
630        || trimmed.ends_with(".tgz")
631        || trimmed.ends_with(".tar.bz2")
632        || trimmed.ends_with(".tar")
633    {
634        Some(true)
635    } else if has_scheme || is_local_path {
636        Some(false)
637    } else {
638        None
639    };
640
641    LinkFlags {
642        is_url: has_scheme || is_vcs_url,
643        is_vcs_url,
644        is_local_path,
645        is_name_at_url: false,
646        is_archive,
647        is_wheel,
648    }
649}
650
/// Returns `true` when the text looks like a URL, VCS reference or local path
/// rather than a bare package requirement.
fn is_link_like(input: &str) -> bool {
    const LINK_PREFIXES: [&str; 9] = [
        "git+", "hg+", "svn+", "bzr+", "file:", "./", "../", "/", "~",
    ];

    let candidate = input.trim();
    candidate.contains("://")
        || LINK_PREFIXES
            .iter()
            .any(|prefix| candidate.starts_with(*prefix))
}
664
/// Returns the exact version when the specifier pins a single version.
///
/// A specifier is treated as pinned when it is a single clause (no `,`) using
/// the `==` or `===` operator with a non-empty version. Any other operator, a
/// compound specifier, or an empty version yields `None`.
fn extract_pinned_version(specifiers: &str) -> Option<String> {
    let spec = specifiers.trim();
    if spec.contains(',') {
        return None;
    }

    // `===` must be tried before `==`: every `===` clause also starts with
    // `==`, and matching the shorter operator first would leave a stray `=`
    // at the front of the version.
    let version = spec
        .strip_prefix("===")
        .or_else(|| spec.strip_prefix("=="))?
        .trim();

    if version.is_empty() {
        None
    } else {
        Some(version.to_string())
    }
}
686
687fn create_pypi_purl(name: &str, version: Option<&str>) -> Option<String> {
688    PackageUrl::new(RequirementsTxtParser::PACKAGE_TYPE.as_str(), name)
689        .ok()
690        .map(|_| match version {
691            Some(version) => format!("pkg:pypi/{name}@{}", encode_pypi_purl_version(version)),
692            None => format!("pkg:pypi/{name}"),
693        })
694}
695
/// Percent-encodes `*` in a version as `%2A`; every other character is kept.
fn encode_pypi_purl_version(version: &str) -> String {
    let mut encoded = String::with_capacity(version.len());
    for ch in version.chars() {
        if ch == '*' {
            encoded.push_str("%2A");
        } else {
            encoded.push(ch);
        }
    }
    encoded
}
699
/// Normalizes a package name: trims it, lowercases ASCII letters, and
/// collapses every run of `-`, `_` and `.` into a single `-`.
fn normalize_pypi_name(name: &str) -> String {
    let mut normalized = String::with_capacity(name.len());

    for ch in name.trim().chars() {
        match ch {
            '-' | '_' | '.' => {
                // Only separators ever push '-', so a trailing '-' means the
                // previous character was already a separator.
                if !normalized.ends_with('-') {
                    normalized.push('-');
                }
            }
            other => normalized.push(other.to_ascii_lowercase()),
        }
    }

    normalized
}
718
// Registers this parser with the crate-wide parser registry.
// NOTE(review): the arguments appear to be, in order: a human-readable
// description, the path glob patterns the parser applies to, the package
// type, the primary language, and an optional documentation URL — confirm
// against the `register_parser!` definition.
// NOTE(review): `is_match` also accepts `.txt`/`.in` files under directories
// whose name merely ends with `requirements`; no pattern below covers that
// case — confirm whether the list is meant to mirror `is_match` exactly.
crate::register_parser!(
    "pip requirements file",
    &[
        "**/requirements*.txt",
        "**/*requirements.txt",
        "**/requirements*.in",
        "**/*requirements.in",
        "**/requires.txt",
        "**/requirements/*.txt",
        "**/requirements/*.in",
        "**/requirements/**/*.txt",
        "**/requirements/**/*.in",
        "**/requirements*/*.txt",
        "**/requirements*/*.in",
        "**/requirements*/**/*.txt",
        "**/requirements*/**/*.in"
    ],
    "pypi",
    "Python",
    Some("https://pip.pypa.io/en/latest/reference/requirements-file-format/"),
);