Skip to main content

provenant/parsers/
requirements_txt.rs

//! Parser for pip requirements.txt files.
//!
//! Extracts Python package dependencies from requirements.txt files using PEP 508
//! specification parsing with support for includes, environment markers, and URLs.
//!
//! # Supported Formats
//! - requirements.txt (pip dependency specification files)
//! - Supports includes: `-r requirements.txt`, `-c constraints.txt`
//! - Supports markers: `package; python_version >= '3.6'`
//! - Supports VCS refs: `git+https://...`, `git+ssh://...`
//!
//! # Key Features
//! - PEP 508 requirement parsing; environment markers are extracted, not evaluated
//! - Recursive file inclusion support (`-r` and `-c` directives)
//! - VCS/URL dependency detection and handling
//! - Package URL (purl) generation for PyPI packages
//! - Line comment handling and continuation lines
//!
//! # Implementation Notes
//! - Uses PEP 508 parser from `pep508` module
//! - Recursively resolves included files relative to parent file
//! - Comment lines (starting with `#`) and trailing ` # ...` comments are skipped
//! - Environment markers are preserved for dependency filtering
24
25use std::collections::{HashMap, HashSet};
26use std::fs;
27use std::path::{Path, PathBuf};
28
29use crate::parser_warn as warn;
30use packageurl::PackageUrl;
31use serde_json::Value as JsonValue;
32
33use crate::models::{DatasourceId, Dependency, PackageData, PackageType};
34use crate::parsers::pep508::{Pep508Requirement, parse_pep508_requirement};
35
36use super::PackageParser;
37
38/// pip requirements.txt parser supporting PEP 508 dependency specifications.
39///
40/// Handles requirements.txt files with -r/-c includes, environment markers,
41/// and VCS/URL references. Recursively resolves included requirement files.
42pub struct RequirementsTxtParser;
43
44impl PackageParser for RequirementsTxtParser {
45    const PACKAGE_TYPE: PackageType = PackageType::Pypi;
46
47    fn extract_packages(path: &Path) -> Vec<PackageData> {
48        vec![extract_from_requirements_txt(path)]
49    }
50
51    fn is_match(path: &Path) -> bool {
52        let filename = path.file_name().and_then(|name| name.to_str());
53        let parent_name = path
54            .parent()
55            .and_then(|parent| parent.file_name())
56            .and_then(|name| name.to_str());
57
58        if let Some(name) = filename
59            && (is_requirements_txt_filename(name)
60                || (parent_name == Some("requirements") && name.ends_with(".txt")))
61        {
62            return true;
63        }
64
65        false
66    }
67}
68
/// Returns `true` for `requirements.txt` and for variants such as
/// `requirements-dev.txt` or `requirements_test.txt`.
///
/// The part between the `requirements` prefix and the `.txt` suffix must be
/// empty or begin with `-` or `_`.
fn is_requirements_txt_filename(name: &str) -> bool {
    let middle = name
        .strip_prefix("requirements")
        .and_then(|rest| rest.strip_suffix(".txt"));

    match middle {
        // An empty middle is the plain `requirements.txt` case.
        Some(part) => part.is_empty() || part.starts_with(['-', '_']),
        None => false,
    }
}
83
/// Mutable accumulator shared across a requirements file and every file it
/// includes.
struct ParseState {
    /// Dependencies collected from requirement lines, in the order encountered.
    dependencies: Vec<Dependency>,
    /// Values of every `--extra-index-url` option seen.
    extra_index_urls: Vec<String>,
    /// Value of the most recent `--index-url` option seen, if any.
    index_url: Option<String>,
    /// Raw path values of `-r` / `--requirement` options, as written in the file.
    includes: Vec<String>,
    /// Raw path values of `-c` / `--constraint` options, as written in the file.
    constraints: Vec<String>,
    /// Canonicalized paths of files already parsed; used to avoid re-parsing
    /// the same file (and hence include cycles).
    visited: HashSet<PathBuf>,
}
92
93fn extract_from_requirements_txt(path: &Path) -> PackageData {
94    let mut state = ParseState {
95        dependencies: Vec::new(),
96        extra_index_urls: Vec::new(),
97        index_url: None,
98        includes: Vec::new(),
99        constraints: Vec::new(),
100        visited: HashSet::new(),
101    };
102
103    let (scope, is_runtime) = scope_from_filename(path);
104
105    parse_requirements_with_includes(path, &mut state, &scope, is_runtime);
106
107    let mut extra_data = HashMap::new();
108    if let Some(url) = state.index_url {
109        extra_data.insert("index_url".to_string(), JsonValue::String(url));
110    }
111    if !state.extra_index_urls.is_empty() {
112        extra_data.insert(
113            "extra_index_urls".to_string(),
114            JsonValue::Array(
115                state
116                    .extra_index_urls
117                    .into_iter()
118                    .map(JsonValue::String)
119                    .collect(),
120            ),
121        );
122    }
123    if !state.includes.is_empty() {
124        extra_data.insert(
125            "requirements_includes".to_string(),
126            JsonValue::Array(state.includes.into_iter().map(JsonValue::String).collect()),
127        );
128    }
129    if !state.constraints.is_empty() {
130        extra_data.insert(
131            "constraints".to_string(),
132            JsonValue::Array(
133                state
134                    .constraints
135                    .into_iter()
136                    .map(JsonValue::String)
137                    .collect(),
138            ),
139        );
140    }
141
142    let extra_data = if extra_data.is_empty() {
143        None
144    } else {
145        Some(extra_data)
146    };
147
148    default_package_data(state.dependencies, extra_data)
149}
150
151fn parse_requirements_with_includes(
152    path: &Path,
153    state: &mut ParseState,
154    scope: &str,
155    is_runtime: bool,
156) {
157    let abs_path = match path.canonicalize() {
158        Ok(p) => p,
159        Err(_) => {
160            warn!("Cannot resolve path: {:?}", path);
161            return;
162        }
163    };
164
165    if state.visited.contains(&abs_path) {
166        warn!("Circular include detected: {:?}", path);
167        return;
168    }
169
170    state.visited.insert(abs_path.clone());
171
172    let content = match fs::read_to_string(&abs_path) {
173        Ok(c) => c,
174        Err(e) => {
175            warn!("Cannot read file {:?}: {}", abs_path, e);
176            return;
177        }
178    };
179
180    for line in collect_logical_lines(&content) {
181        let cleaned = strip_inline_comment(&line);
182        let trimmed = cleaned.trim();
183        if trimmed.is_empty() || trimmed.starts_with('#') {
184            continue;
185        }
186
187        if let Some(url) = parse_option_value(trimmed, "--extra-index-url") {
188            state.extra_index_urls.push(url);
189            continue;
190        }
191
192        if let Some(url) = parse_option_value(trimmed, "--index-url") {
193            state.index_url = Some(url);
194            continue;
195        }
196
197        if let Some(path_value) = parse_option_value(trimmed, "-r")
198            .or_else(|| parse_option_value(trimmed, "--requirement"))
199        {
200            state.includes.push(path_value.clone());
201            let included_path = abs_path
202                .parent()
203                .unwrap_or_else(|| Path::new("."))
204                .join(&path_value);
205
206            if included_path.exists() {
207                parse_requirements_with_includes(&included_path, state, scope, is_runtime);
208            } else {
209                warn!("Included file not found: {:?}", included_path);
210            }
211            continue;
212        }
213
214        if let Some(path_value) = parse_option_value(trimmed, "-c")
215            .or_else(|| parse_option_value(trimmed, "--constraint"))
216        {
217            state.constraints.push(path_value.clone());
218            let constraint_path = abs_path
219                .parent()
220                .unwrap_or_else(|| Path::new("."))
221                .join(&path_value);
222
223            if constraint_path.exists() {
224                parse_requirements_with_includes(&constraint_path, state, scope, is_runtime);
225            } else {
226                warn!("Constraint file not found: {:?}", constraint_path);
227            }
228            continue;
229        }
230
231        if trimmed.starts_with('-')
232            && !trimmed.starts_with("-e")
233            && !trimmed.starts_with("--editable")
234        {
235            continue;
236        }
237
238        if let Some(dependency) = build_dependency(trimmed, scope, is_runtime) {
239            state.dependencies.push(dependency);
240        }
241    }
242}
243
244fn default_package_data(
245    dependencies: Vec<Dependency>,
246    extra_data: Option<HashMap<String, JsonValue>>,
247) -> PackageData {
248    PackageData {
249        package_type: Some(RequirementsTxtParser::PACKAGE_TYPE),
250        primary_language: Some("Python".to_string()),
251        extra_data,
252        dependencies,
253        datasource_id: Some(DatasourceId::PipRequirements),
254        ..Default::default()
255    }
256}
257
/// Splits `content` into logical lines, joining physical lines that end in a
/// backslash continuation.
///
/// Each physical fragment is trimmed and fragments are joined with a single
/// space. Blank lines produce no output; a blank line that is not itself a
/// continuation terminates a pending continuation.
fn collect_logical_lines(content: &str) -> Vec<String> {
    let mut logical_lines = Vec::new();
    let mut pending = String::new();

    for physical in content.lines() {
        let right_trimmed = physical.trim_end();
        let continues = right_trimmed.ends_with('\\');
        // Stripping backslashes is a no-op when the line is not a continuation.
        let fragment = right_trimmed.trim_end_matches('\\').trim();

        if !fragment.is_empty() {
            if !pending.is_empty() {
                pending.push(' ');
            }
            pending.push_str(fragment);
        }

        if continues || pending.is_empty() {
            continue;
        }
        // `pending` is built from trimmed, non-empty fragments, so it carries
        // no leading or trailing whitespace.
        logical_lines.push(std::mem::take(&mut pending));
    }

    // A trailing continuation with no following line still yields its text.
    if !pending.is_empty() {
        logical_lines.push(pending);
    }

    logical_lines
}
291
/// Removes a trailing `# comment` from `line`.
///
/// A `#` starts a comment only when it is outside single/double quotes and is
/// either the first character or preceded by whitespace, so URL fragments such
/// as `...#egg=name` are left alone. The returned text has trailing whitespace
/// removed when a comment was stripped; otherwise the line is returned as is.
fn strip_inline_comment(line: &str) -> String {
    // `Some(q)` while inside a quoted section opened by `q`.
    let mut open_quote: Option<char> = None;

    let comment_start = line.char_indices().find_map(|(pos, ch)| {
        match (open_quote, ch) {
            (None, '\'' | '"') => open_quote = Some(ch),
            (Some(quote), _) if quote == ch => open_quote = None,
            (None, '#') => {
                let head = &line[..pos];
                if head.is_empty() || head.ends_with(char::is_whitespace) {
                    return Some(pos);
                }
            }
            _ => {}
        }
        None
    });

    match comment_start {
        Some(pos) => line[..pos].trim_end().to_string(),
        None => line.to_string(),
    }
}
310
/// Returns the value of `option` when `line` starts with it.
///
/// Accepts `--opt value`, `--opt=value` and `--opt = value`. Yields `None`
/// when the line does not start with `option` or the value is empty.
fn parse_option_value(line: &str, option: &str) -> Option<String> {
    let after_option = line.strip_prefix(option)?.trim();
    let value = match after_option.strip_prefix('=') {
        Some(after_equals) => after_equals.trim(),
        None => after_option,
    };
    (!value.is_empty()).then(|| value.to_string())
}
323
/// Derives the dependency scope and runtime flag from the file name of `path`.
///
/// The lower-cased file name is searched for `dev`, `test` and `doc`, in that
/// order; the first hit gives a non-runtime scope (`develop`, `test`, `docs`).
/// Anything else is the runtime `install` scope.
fn scope_from_filename(path: &Path) -> (String, bool) {
    // (substring to look for, scope it maps to) - order defines priority.
    const NON_RUNTIME_SCOPES: [(&str, &str); 3] =
        [("dev", "develop"), ("test", "test"), ("doc", "docs")];

    let lowered = path
        .file_name()
        .and_then(|raw| raw.to_str())
        .unwrap_or_default()
        .to_ascii_lowercase();

    NON_RUNTIME_SCOPES
        .iter()
        .find(|&&(needle, _)| lowered.contains(needle))
        .map(|&(_, scope)| (scope.to_string(), false))
        .unwrap_or_else(|| ("install".to_string(), true))
}
343
344fn build_dependency(line: &str, scope: &str, is_runtime: bool) -> Option<Dependency> {
345    let trimmed = line.trim();
346    if trimmed.is_empty() {
347        return None;
348    }
349
350    let mut is_editable = false;
351    let mut requirement = trimmed.to_string();
352    let mut extracted_requirement = trimmed.to_string();
353
354    if let Some(rest) = trimmed.strip_prefix("-e") {
355        is_editable = true;
356        requirement = rest.trim().to_string();
357        extracted_requirement = format!("--editable {}", requirement);
358    } else if let Some(rest) = trimmed.strip_prefix("--editable") {
359        is_editable = true;
360        requirement = rest.trim().to_string();
361        extracted_requirement = format!("--editable {}", requirement);
362    }
363
364    let (requirement, hash_options) = split_hash_options(&requirement);
365    let requirement = requirement.trim();
366    if requirement.is_empty() {
367        return None;
368    }
369
370    let parsed = parse_requirement(requirement);
371
372    let pinned_version = parsed
373        .specifiers
374        .as_deref()
375        .and_then(extract_pinned_version);
376    let is_pinned = pinned_version.is_some();
377
378    let purl = parsed
379        .name
380        .as_ref()
381        .and_then(|name| create_pypi_purl(name, pinned_version.as_deref()));
382
383    let mut extra_data = HashMap::new();
384    extra_data.insert("is_editable".to_string(), JsonValue::Bool(is_editable));
385    extra_data.insert(
386        "link".to_string(),
387        parsed
388            .link
389            .clone()
390            .map(JsonValue::String)
391            .unwrap_or(JsonValue::Null),
392    );
393    extra_data.insert(
394        "hash_options".to_string(),
395        JsonValue::Array(hash_options.into_iter().map(JsonValue::String).collect()),
396    );
397    extra_data.insert("is_constraint".to_string(), JsonValue::Bool(false));
398    extra_data.insert(
399        "is_archive".to_string(),
400        parsed
401            .is_archive
402            .map(JsonValue::Bool)
403            .unwrap_or(JsonValue::Null),
404    );
405    extra_data.insert("is_wheel".to_string(), JsonValue::Bool(parsed.is_wheel));
406    extra_data.insert(
407        "is_url".to_string(),
408        parsed
409            .is_url
410            .map(JsonValue::Bool)
411            .unwrap_or(JsonValue::Null),
412    );
413    extra_data.insert(
414        "is_vcs_url".to_string(),
415        parsed
416            .is_vcs_url
417            .map(JsonValue::Bool)
418            .unwrap_or(JsonValue::Null),
419    );
420    extra_data.insert(
421        "is_name_at_url".to_string(),
422        JsonValue::Bool(parsed.is_name_at_url),
423    );
424    extra_data.insert(
425        "is_local_path".to_string(),
426        parsed
427            .is_local_path
428            .map(|value| value || is_editable)
429            .map(JsonValue::Bool)
430            .unwrap_or(JsonValue::Null),
431    );
432
433    if let Some(marker) = parsed.marker {
434        extra_data.insert("markers".to_string(), JsonValue::String(marker));
435    }
436
437    Some(Dependency {
438        purl,
439        extracted_requirement: Some(extracted_requirement),
440        scope: Some(scope.to_string()),
441        is_runtime: Some(is_runtime),
442        is_optional: Some(false),
443        is_pinned: Some(is_pinned),
444        is_direct: Some(true),
445        resolved_package: None,
446        extra_data: Some(extra_data),
447    })
448}
449
/// Separates `--hash=<value>` tokens from the rest of a requirement line.
///
/// Returns the remaining whitespace-separated tokens re-joined with single
/// spaces, plus the non-empty hash values in their original order. Empty
/// `--hash=` tokens are dropped from both outputs.
fn split_hash_options(input: &str) -> (String, Vec<String>) {
    const HASH_PREFIX: &str = "--hash=";

    let (hash_tokens, other_tokens): (Vec<&str>, Vec<&str>) = input
        .split_whitespace()
        .partition(|token| token.starts_with(HASH_PREFIX));

    let hashes = hash_tokens
        .into_iter()
        .filter_map(|token| token.strip_prefix(HASH_PREFIX))
        .filter(|digest| !digest.is_empty())
        .map(str::to_string)
        .collect();

    (other_tokens.join(" "), hashes)
}
466
/// Normalized view of a single requirement, whether it is a plain PEP 508
/// requirement or a link (URL, VCS reference, local path).
struct ParsedRequirement {
    /// Normalized PyPI name; `None` for a bare link without an `#egg=` name.
    name: Option<String>,
    /// Version specifiers as returned by the PEP 508 parser, if any.
    specifiers: Option<String>,
    /// Environment marker as returned by the PEP 508 parser, if any.
    marker: Option<String>,
    /// The link text for link-style requirements; `None` for plain names.
    link: Option<String>,
    /// Link classification flags; `None` when the requirement has no link.
    is_url: Option<bool>,
    is_vcs_url: Option<bool>,
    is_local_path: Option<bool>,
    /// Whether the PEP 508 parser reported a `name @ url` form.
    is_name_at_url: bool,
    /// `None` when no link exists or the link gives no indication either way.
    is_archive: Option<bool>,
    /// Whether the link ends in `.whl`.
    is_wheel: bool,
}
479
480fn parse_requirement(input: &str) -> ParsedRequirement {
481    if let Some(parsed) = parse_pep508_requirement(input) {
482        if let Some(url) = parsed.url.clone() {
483            return parsed_with_link(parsed, &url);
484        }
485
486        if !is_link_like(input) {
487            let name = Some(normalize_pypi_name(&parsed.name));
488            return ParsedRequirement {
489                name,
490                specifiers: parsed.specifiers,
491                marker: parsed.marker,
492                link: None,
493                is_url: None,
494                is_vcs_url: None,
495                is_local_path: None,
496                is_name_at_url: false,
497                is_archive: None,
498                is_wheel: false,
499            };
500        }
501    }
502
503    if let Some((name, link)) = parse_link_with_name(input) {
504        let normalized_name = normalize_pypi_name(&name);
505        let link_info = parse_link_flags(&link);
506        return ParsedRequirement {
507            name: Some(normalized_name),
508            specifiers: None,
509            marker: None,
510            link: Some(link),
511            is_url: Some(link_info.is_url),
512            is_vcs_url: Some(link_info.is_vcs_url),
513            is_local_path: Some(link_info.is_local_path),
514            is_name_at_url: link_info.is_name_at_url,
515            is_archive: link_info.is_archive,
516            is_wheel: link_info.is_wheel,
517        };
518    }
519
520    let link_info = parse_link_flags(input);
521    ParsedRequirement {
522        name: None,
523        specifiers: None,
524        marker: None,
525        link: Some(input.to_string()),
526        is_url: Some(link_info.is_url),
527        is_vcs_url: Some(link_info.is_vcs_url),
528        is_local_path: Some(link_info.is_local_path),
529        is_name_at_url: link_info.is_name_at_url,
530        is_archive: link_info.is_archive,
531        is_wheel: link_info.is_wheel,
532    }
533}
534
535fn parsed_with_link(parsed: Pep508Requirement, link: &str) -> ParsedRequirement {
536    let name = normalize_pypi_name(&parsed.name);
537    let link_info = parse_link_flags(link);
538    ParsedRequirement {
539        name: Some(name),
540        specifiers: parsed.specifiers,
541        marker: parsed.marker,
542        link: Some(link.to_string()),
543        is_url: Some(link_info.is_url),
544        is_vcs_url: Some(link_info.is_vcs_url),
545        is_local_path: Some(link_info.is_local_path),
546        is_name_at_url: parsed.is_name_at_url,
547        is_archive: link_info.is_archive,
548        is_wheel: link_info.is_wheel,
549    }
550}
551
552fn parse_link_with_name(input: &str) -> Option<(String, String)> {
553    if let Some(egg) = extract_egg_name(input) {
554        return Some((egg, input.to_string()));
555    }
556    None
557}
558
559fn extract_egg_name(input: &str) -> Option<String> {
560    let fragment = input.split('#').nth(1)?;
561    let egg_part = fragment.strip_prefix("egg=")?;
562    let name_part = egg_part.split('&').next()?.trim();
563    if name_part.is_empty() {
564        return None;
565    }
566    let (name, _extras, _) = parse_pep508_requirement(name_part)
567        .map(|parsed| (parsed.name, parsed.extras, parsed.specifiers))
568        .unwrap_or_else(|| (name_part.to_string(), Vec::new(), None));
569    Some(name)
570}
571
/// Classification of a link string as computed by `parse_link_flags`.
struct LinkFlags {
    /// The link has a URL scheme or a VCS prefix.
    is_url: bool,
    /// The link starts with a VCS prefix such as `git+`.
    is_vcs_url: bool,
    /// The link looks like a filesystem path or a `file:` URL.
    is_local_path: bool,
    /// Always `false` when produced by `parse_link_flags`.
    is_name_at_url: bool,
    /// `Some(true)` for a known archive suffix, `Some(false)` for other URLs
    /// and local paths, `None` when nothing can be said.
    is_archive: Option<bool>,
    /// The link ends in `.whl`.
    is_wheel: bool,
}
580
581fn parse_link_flags(link: &str) -> LinkFlags {
582    let trimmed = link.trim();
583    let is_vcs_url = trimmed.starts_with("git+")
584        || trimmed.starts_with("hg+")
585        || trimmed.starts_with("svn+")
586        || trimmed.starts_with("bzr+");
587    let has_scheme = trimmed.contains("://") || trimmed.starts_with("file:");
588    let is_local_path = trimmed.starts_with("./")
589        || trimmed.starts_with("../")
590        || trimmed.starts_with('/')
591        || trimmed.starts_with('~')
592        || trimmed.starts_with("file:");
593
594    let is_wheel = trimmed.ends_with(".whl");
595    let is_archive = if is_wheel
596        || trimmed.ends_with(".zip")
597        || trimmed.ends_with(".tar.gz")
598        || trimmed.ends_with(".tgz")
599        || trimmed.ends_with(".tar.bz2")
600        || trimmed.ends_with(".tar")
601    {
602        Some(true)
603    } else if has_scheme || is_local_path {
604        Some(false)
605    } else {
606        None
607    };
608
609    LinkFlags {
610        is_url: has_scheme || is_vcs_url,
611        is_vcs_url,
612        is_local_path,
613        is_name_at_url: false,
614        is_archive,
615        is_wheel,
616    }
617}
618
/// Returns `true` when `input` looks like a URL, VCS reference or filesystem
/// path rather than a plain package name.
///
/// The check is purely textual: a known prefix (after trimming) or `://`
/// anywhere in the text.
fn is_link_like(input: &str) -> bool {
    const LINK_PREFIXES: [&str; 9] = [
        "git+", "hg+", "svn+", "bzr+", "file:", "./", "../", "/", "~",
    ];

    let candidate = input.trim();
    candidate.contains("://")
        || LINK_PREFIXES
            .iter()
            .any(|&prefix| candidate.starts_with(prefix))
}
632
/// Returns the exact version when `specifiers` is a single pin.
///
/// A pin is one `==<version>` or `===<version>` clause with a non-empty
/// version that contains no `*` wildcard. Anything else, including compound
/// specifiers containing `,`, yields `None`.
fn extract_pinned_version(specifiers: &str) -> Option<String> {
    let trimmed = specifiers.trim();
    if trimmed.contains(',') {
        return None;
    }

    // `===` must be tried first: `==` is a prefix of it, so the reverse order
    // would leave a stray `=` at the front of the version.
    let version = trimmed
        .strip_prefix("===")
        .or_else(|| trimmed.strip_prefix("=="))?
        .trim();

    if version.is_empty() || version.contains('*') {
        None
    } else {
        Some(version.to_string())
    }
}
654
655fn create_pypi_purl(name: &str, version: Option<&str>) -> Option<String> {
656    let mut purl = PackageUrl::new(RequirementsTxtParser::PACKAGE_TYPE.as_str(), name).ok()?;
657    if let Some(version) = version {
658        purl.with_version(version).ok()?;
659    }
660    Some(purl.to_string())
661}
662
/// Normalizes a project name: trims it, lower-cases ASCII letters, and
/// collapses every run of `-`, `_` and `.` into a single `-`.
///
/// Leading and trailing separator runs are kept (as a single `-`).
fn normalize_pypi_name(name: &str) -> String {
    let source = name.trim();
    let mut normalized = String::with_capacity(source.len());

    for ch in source.chars() {
        match ch {
            '-' | '_' | '.' => {
                // Only a separator can leave `-` at the end of the output, so
                // a trailing `-` means the current run was already emitted.
                if !normalized.ends_with('-') {
                    normalized.push('-');
                }
            }
            other => normalized.push(other.to_ascii_lowercase()),
        }
    }

    normalized
}
681
// Registers this parser with the crate-wide parser registry. The arguments are
// presumably: description, path glob patterns, package type, primary language
// and documentation URL - confirm against the `register_parser!` definition.
//
// NOTE(review): `RequirementsTxtParser::is_match` only accepts names ending in
// `.txt`, so files matching the `requirements*.in` glob below are not accepted
// by it - confirm which of the two is intended.
crate::register_parser!(
    "pip requirements file",
    &[
        "**/requirements*.txt",
        "**/requirements*.in",
        "**/requirements/*.txt"
    ],
    "pypi",
    "Python",
    Some("https://pip.pypa.io/en/latest/reference/requirements-file-format/"),
);