Skip to main content

provenant/parsers/
requirements_txt.rs

1//! Parser for pip requirements.txt files.
2//!
3//! Extracts Python package dependencies from requirements.txt files using PEP 508
4//! specification parsing with support for includes, environment markers, and URLs.
5//!
6//! # Supported Formats
7//! - requirements.txt (pip dependency specification files)
8//! - Supports includes: `-r requirements.txt`, `-c constraints.txt`
9//! - Supports markers: `package; python_version >= '3.6'`
10//! - Supports VCS refs: `git+https://...`, `git+ssh://...`
11//!
12//! # Key Features
//! - PEP 508 requirement parsing; environment markers are extracted and preserved, not evaluated
14//! - Recursive file inclusion support (`-r` and `-c` directives)
15//! - VCS/URL dependency detection and handling
16//! - Package URL (purl) generation for PyPI packages
17//! - Line comment handling and continuation lines
18//!
19//! # Implementation Notes
20//! - Uses PEP 508 parser from `pep508` module
21//! - Recursively resolves included files relative to parent file
22//! - Comments (lines starting with `#`) are skipped
23//! - Environment markers are preserved for dependency filtering
24
25use std::collections::{HashMap, HashSet};
26use std::fs;
27use std::path::{Path, PathBuf};
28
29use log::warn;
30use packageurl::PackageUrl;
31use serde_json::Value as JsonValue;
32
33use crate::models::{DatasourceId, Dependency, PackageData, PackageType};
34use crate::parsers::pep508::{Pep508Requirement, parse_pep508_requirement};
35
36use super::PackageParser;
37
38/// pip requirements.txt parser supporting PEP 508 dependency specifications.
39///
40/// Handles requirements.txt files with -r/-c includes, environment markers,
41/// and VCS/URL references. Recursively resolves included requirement files.
42pub struct RequirementsTxtParser;
43
44impl PackageParser for RequirementsTxtParser {
45    const PACKAGE_TYPE: PackageType = PackageType::Pypi;
46
47    fn extract_packages(path: &Path) -> Vec<PackageData> {
48        vec![extract_from_requirements_txt(path)]
49    }
50
51    fn is_match(path: &Path) -> bool {
52        let filename = path.file_name().and_then(|name| name.to_str());
53        let parent_name = path
54            .parent()
55            .and_then(|parent| parent.file_name())
56            .and_then(|name| name.to_str());
57
58        if let Some(name) = filename {
59            if name == "requirements.txt" {
60                return true;
61            }
62            if name.starts_with("requirements-") && name.ends_with(".txt") {
63                return true;
64            }
65            if parent_name == Some("requirements") && name.ends_with(".txt") {
66                return true;
67            }
68        }
69
70        false
71    }
72}
73
74struct ParseState {
75    dependencies: Vec<Dependency>,
76    extra_index_urls: Vec<String>,
77    index_url: Option<String>,
78    includes: Vec<String>,
79    constraints: Vec<String>,
80    visited: HashSet<PathBuf>,
81}
82
83fn extract_from_requirements_txt(path: &Path) -> PackageData {
84    let mut state = ParseState {
85        dependencies: Vec::new(),
86        extra_index_urls: Vec::new(),
87        index_url: None,
88        includes: Vec::new(),
89        constraints: Vec::new(),
90        visited: HashSet::new(),
91    };
92
93    let (scope, is_runtime) = scope_from_filename(path);
94
95    parse_requirements_with_includes(path, &mut state, &scope, is_runtime);
96
97    let mut extra_data = HashMap::new();
98    if let Some(url) = state.index_url {
99        extra_data.insert("index_url".to_string(), JsonValue::String(url));
100    }
101    if !state.extra_index_urls.is_empty() {
102        extra_data.insert(
103            "extra_index_urls".to_string(),
104            JsonValue::Array(
105                state
106                    .extra_index_urls
107                    .into_iter()
108                    .map(JsonValue::String)
109                    .collect(),
110            ),
111        );
112    }
113    if !state.includes.is_empty() {
114        extra_data.insert(
115            "requirements_includes".to_string(),
116            JsonValue::Array(state.includes.into_iter().map(JsonValue::String).collect()),
117        );
118    }
119    if !state.constraints.is_empty() {
120        extra_data.insert(
121            "constraints".to_string(),
122            JsonValue::Array(
123                state
124                    .constraints
125                    .into_iter()
126                    .map(JsonValue::String)
127                    .collect(),
128            ),
129        );
130    }
131
132    let extra_data = if extra_data.is_empty() {
133        None
134    } else {
135        Some(extra_data)
136    };
137
138    default_package_data(state.dependencies, extra_data)
139}
140
141fn parse_requirements_with_includes(
142    path: &Path,
143    state: &mut ParseState,
144    scope: &str,
145    is_runtime: bool,
146) {
147    let abs_path = match path.canonicalize() {
148        Ok(p) => p,
149        Err(_) => {
150            warn!("Cannot resolve path: {:?}", path);
151            return;
152        }
153    };
154
155    if state.visited.contains(&abs_path) {
156        warn!("Circular include detected: {:?}", path);
157        return;
158    }
159
160    state.visited.insert(abs_path.clone());
161
162    let content = match fs::read_to_string(&abs_path) {
163        Ok(c) => c,
164        Err(e) => {
165            warn!("Cannot read file {:?}: {}", abs_path, e);
166            return;
167        }
168    };
169
170    for line in collect_logical_lines(&content) {
171        let cleaned = strip_inline_comment(&line);
172        let trimmed = cleaned.trim();
173        if trimmed.is_empty() || trimmed.starts_with('#') {
174            continue;
175        }
176
177        if let Some(url) = parse_option_value(trimmed, "--extra-index-url") {
178            state.extra_index_urls.push(url);
179            continue;
180        }
181
182        if let Some(url) = parse_option_value(trimmed, "--index-url") {
183            state.index_url = Some(url);
184            continue;
185        }
186
187        if let Some(path_value) = parse_option_value(trimmed, "-r")
188            .or_else(|| parse_option_value(trimmed, "--requirement"))
189        {
190            state.includes.push(path_value.clone());
191            let included_path = abs_path
192                .parent()
193                .unwrap_or_else(|| Path::new("."))
194                .join(&path_value);
195
196            if included_path.exists() {
197                parse_requirements_with_includes(&included_path, state, scope, is_runtime);
198            } else {
199                warn!("Included file not found: {:?}", included_path);
200            }
201            continue;
202        }
203
204        if let Some(path_value) = parse_option_value(trimmed, "-c")
205            .or_else(|| parse_option_value(trimmed, "--constraint"))
206        {
207            state.constraints.push(path_value.clone());
208            let constraint_path = abs_path
209                .parent()
210                .unwrap_or_else(|| Path::new("."))
211                .join(&path_value);
212
213            if constraint_path.exists() {
214                parse_requirements_with_includes(&constraint_path, state, scope, is_runtime);
215            } else {
216                warn!("Constraint file not found: {:?}", constraint_path);
217            }
218            continue;
219        }
220
221        if trimmed.starts_with('-')
222            && !trimmed.starts_with("-e")
223            && !trimmed.starts_with("--editable")
224        {
225            continue;
226        }
227
228        if let Some(dependency) = build_dependency(trimmed, scope, is_runtime) {
229            state.dependencies.push(dependency);
230        }
231    }
232}
233
234fn default_package_data(
235    dependencies: Vec<Dependency>,
236    extra_data: Option<HashMap<String, JsonValue>>,
237) -> PackageData {
238    PackageData {
239        package_type: Some(RequirementsTxtParser::PACKAGE_TYPE),
240        primary_language: Some("Python".to_string()),
241        extra_data,
242        dependencies,
243        datasource_id: Some(DatasourceId::PipRequirements),
244        ..Default::default()
245    }
246}
247
/// Splits `content` into logical lines, joining physical lines that end with
/// a backslash continuation.
///
/// Joined fragments are trimmed and separated by a single space; blank
/// lines are dropped. A line that starts with `#` (after optional leading
/// whitespace) is a comment and never continues onto the next line, even if
/// it ends with a backslash. This matches pip, and keeps a trailing `\` in a
/// comment from swallowing the requirement that follows it.
fn collect_logical_lines(content: &str) -> Vec<String> {
    let mut lines = Vec::new();
    let mut current = String::new();

    for raw_line in content.lines() {
        let line = raw_line.trim_end_matches('\r');
        let trimmed = line.trim_end();

        let is_comment = trimmed.trim_start().starts_with('#');
        let is_continuation = !is_comment && trimmed.ends_with('\\');
        let line_without = if is_continuation {
            trimmed.trim_end_matches('\\')
        } else {
            line
        };

        let fragment = line_without.trim();
        if !fragment.is_empty() {
            if !current.is_empty() {
                current.push(' ');
            }
            current.push_str(fragment);
        }

        if !is_continuation && !current.is_empty() {
            lines.push(current.trim().to_string());
            current.clear();
        }
    }

    // A dangling continuation at end of file still yields what was gathered.
    if !current.is_empty() {
        lines.push(current.trim().to_string());
    }

    lines
}
281
/// Removes a trailing `# comment` from `line`.
///
/// A `#` starts a comment only when it is outside single/double quotes and
/// is either the first character or preceded by whitespace, so URL fragments
/// such as `...#egg=name` are left untouched. The returned text has trailing
/// whitespace removed when a comment was stripped; otherwise `line` is
/// returned unchanged.
fn strip_inline_comment(line: &str) -> String {
    // `Some(q)` while inside a quoted section opened by `q`.
    let mut open_quote: Option<char> = None;

    for (idx, ch) in line.char_indices() {
        match (open_quote, ch) {
            (Some(quote), c) if c == quote => open_quote = None,
            (None, '\'' | '"') => open_quote = Some(ch),
            (None, '#') => {
                let head = &line[..idx];
                if head.is_empty() || head.ends_with(char::is_whitespace) {
                    return head.trim_end().to_string();
                }
            }
            _ => {}
        }
    }

    line.to_owned()
}
300
/// Extracts the value of `option` from `line`.
///
/// `line` must start with `option`; the value may follow directly, after
/// whitespace, or after `=`. Returns `None` when `line` does not start with
/// `option` or when the value is empty.
fn parse_option_value(line: &str, option: &str) -> Option<String> {
    let after_option = line.strip_prefix(option)?.trim();
    let value = after_option
        .strip_prefix('=')
        .map_or(after_option, str::trim);

    (!value.is_empty()).then(|| value.to_string())
}
313
/// Derives the dependency scope and runtime flag from the file name of
/// `path`.
///
/// The lower-cased file name is searched, in order, for `dev`, `test` and
/// `doc`; the first hit selects the `develop`, `test` or `docs` scope with
/// `is_runtime == false`. Anything else is an `install` (runtime) scope.
fn scope_from_filename(path: &Path) -> (String, bool) {
    // (substring to look for, resulting scope), in priority order.
    const SCOPE_MARKERS: [(&str, &str); 3] =
        [("dev", "develop"), ("test", "test"), ("doc", "docs")];

    let lowered = path
        .file_name()
        .and_then(|name| name.to_str())
        .map(str::to_ascii_lowercase)
        .unwrap_or_default();

    SCOPE_MARKERS
        .iter()
        .find(|(needle, _)| lowered.contains(*needle))
        .map_or_else(
            || ("install".to_string(), true),
            |(_, scope)| (scope.to_string(), false),
        )
}
333
334fn build_dependency(line: &str, scope: &str, is_runtime: bool) -> Option<Dependency> {
335    let trimmed = line.trim();
336    if trimmed.is_empty() {
337        return None;
338    }
339
340    let mut is_editable = false;
341    let mut requirement = trimmed.to_string();
342    let mut extracted_requirement = trimmed.to_string();
343
344    if let Some(rest) = trimmed.strip_prefix("-e") {
345        is_editable = true;
346        requirement = rest.trim().to_string();
347        extracted_requirement = format!("--editable {}", requirement);
348    } else if let Some(rest) = trimmed.strip_prefix("--editable") {
349        is_editable = true;
350        requirement = rest.trim().to_string();
351        extracted_requirement = format!("--editable {}", requirement);
352    }
353
354    let (requirement, hash_options) = split_hash_options(&requirement);
355    let requirement = requirement.trim();
356    if requirement.is_empty() {
357        return None;
358    }
359
360    let parsed = parse_requirement(requirement);
361
362    let pinned_version = parsed
363        .specifiers
364        .as_deref()
365        .and_then(extract_pinned_version);
366    let is_pinned = pinned_version.is_some();
367
368    let purl = parsed
369        .name
370        .as_ref()
371        .and_then(|name| create_pypi_purl(name, pinned_version.as_deref()));
372
373    let mut extra_data = HashMap::new();
374    extra_data.insert("is_editable".to_string(), JsonValue::Bool(is_editable));
375    extra_data.insert(
376        "link".to_string(),
377        parsed
378            .link
379            .clone()
380            .map(JsonValue::String)
381            .unwrap_or(JsonValue::Null),
382    );
383    extra_data.insert(
384        "hash_options".to_string(),
385        JsonValue::Array(hash_options.into_iter().map(JsonValue::String).collect()),
386    );
387    extra_data.insert("is_constraint".to_string(), JsonValue::Bool(false));
388    extra_data.insert(
389        "is_archive".to_string(),
390        parsed
391            .is_archive
392            .map(JsonValue::Bool)
393            .unwrap_or(JsonValue::Null),
394    );
395    extra_data.insert("is_wheel".to_string(), JsonValue::Bool(parsed.is_wheel));
396    extra_data.insert(
397        "is_url".to_string(),
398        parsed
399            .is_url
400            .map(JsonValue::Bool)
401            .unwrap_or(JsonValue::Null),
402    );
403    extra_data.insert(
404        "is_vcs_url".to_string(),
405        parsed
406            .is_vcs_url
407            .map(JsonValue::Bool)
408            .unwrap_or(JsonValue::Null),
409    );
410    extra_data.insert(
411        "is_name_at_url".to_string(),
412        JsonValue::Bool(parsed.is_name_at_url),
413    );
414    extra_data.insert(
415        "is_local_path".to_string(),
416        parsed
417            .is_local_path
418            .map(|value| value || is_editable)
419            .map(JsonValue::Bool)
420            .unwrap_or(JsonValue::Null),
421    );
422
423    if let Some(marker) = parsed.marker {
424        extra_data.insert("markers".to_string(), JsonValue::String(marker));
425    }
426
427    Some(Dependency {
428        purl,
429        extracted_requirement: Some(extracted_requirement),
430        scope: Some(scope.to_string()),
431        is_runtime: Some(is_runtime),
432        is_optional: Some(false),
433        is_pinned: Some(is_pinned),
434        is_direct: Some(true),
435        resolved_package: None,
436        extra_data: Some(extra_data),
437    })
438}
439
/// Separates `--hash=<value>` tokens from the rest of a requirement line.
///
/// Returns the remaining whitespace-separated tokens re-joined with single
/// spaces, plus the non-empty hash values in their original order. An empty
/// `--hash=` token is dropped from both outputs.
fn split_hash_options(input: &str) -> (String, Vec<String>) {
    const HASH_FLAG: &str = "--hash=";

    let (hash_tokens, other_tokens): (Vec<&str>, Vec<&str>) = input
        .split_whitespace()
        .partition(|token| token.starts_with(HASH_FLAG));

    let hashes = hash_tokens
        .into_iter()
        .filter_map(|token| token.strip_prefix(HASH_FLAG))
        .filter(|value| !value.is_empty())
        .map(str::to_string)
        .collect();

    (other_tokens.join(" "), hashes)
}
456
/// Normalized view of a single requirement, whether it was a plain PEP 508
/// requirement or a link (URL, VCS reference, local path).
///
/// The `Option<bool>` flags are `None` for plain named requirements that
/// carry no link.
struct ParsedRequirement {
    /// Normalized package name, when one could be determined.
    name: Option<String>,
    /// Version specifiers as written (for example `==1.0`).
    specifiers: Option<String>,
    /// Environment marker text, if present.
    marker: Option<String>,
    /// The link the requirement points at, if any.
    link: Option<String>,
    /// Whether the requirement used the PEP 508 `name @ url` form.
    is_name_at_url: bool,
    /// Whether the link looks like a URL (has a scheme or is a VCS link).
    is_url: Option<bool>,
    /// Whether the link uses a VCS prefix (`git+`, `hg+`, `svn+`, `bzr+`).
    is_vcs_url: Option<bool>,
    /// Whether the link looks like a local filesystem path.
    is_local_path: Option<bool>,
    /// Whether the link targets an archive; `None` when undetermined.
    is_archive: Option<bool>,
    /// Whether the link targets a `.whl` file.
    is_wheel: bool,
}
469
470fn parse_requirement(input: &str) -> ParsedRequirement {
471    if let Some(parsed) = parse_pep508_requirement(input) {
472        if let Some(url) = parsed.url.clone() {
473            return parsed_with_link(parsed, &url);
474        }
475
476        if !is_link_like(input) {
477            let name = Some(normalize_pypi_name(&parsed.name));
478            return ParsedRequirement {
479                name,
480                specifiers: parsed.specifiers,
481                marker: parsed.marker,
482                link: None,
483                is_url: None,
484                is_vcs_url: None,
485                is_local_path: None,
486                is_name_at_url: false,
487                is_archive: None,
488                is_wheel: false,
489            };
490        }
491    }
492
493    if let Some((name, link)) = parse_link_with_name(input) {
494        let normalized_name = normalize_pypi_name(&name);
495        let link_info = parse_link_flags(&link);
496        return ParsedRequirement {
497            name: Some(normalized_name),
498            specifiers: None,
499            marker: None,
500            link: Some(link),
501            is_url: Some(link_info.is_url),
502            is_vcs_url: Some(link_info.is_vcs_url),
503            is_local_path: Some(link_info.is_local_path),
504            is_name_at_url: link_info.is_name_at_url,
505            is_archive: link_info.is_archive,
506            is_wheel: link_info.is_wheel,
507        };
508    }
509
510    let link_info = parse_link_flags(input);
511    ParsedRequirement {
512        name: None,
513        specifiers: None,
514        marker: None,
515        link: Some(input.to_string()),
516        is_url: Some(link_info.is_url),
517        is_vcs_url: Some(link_info.is_vcs_url),
518        is_local_path: Some(link_info.is_local_path),
519        is_name_at_url: link_info.is_name_at_url,
520        is_archive: link_info.is_archive,
521        is_wheel: link_info.is_wheel,
522    }
523}
524
525fn parsed_with_link(parsed: Pep508Requirement, link: &str) -> ParsedRequirement {
526    let name = normalize_pypi_name(&parsed.name);
527    let link_info = parse_link_flags(link);
528    ParsedRequirement {
529        name: Some(name),
530        specifiers: parsed.specifiers,
531        marker: parsed.marker,
532        link: Some(link.to_string()),
533        is_url: Some(link_info.is_url),
534        is_vcs_url: Some(link_info.is_vcs_url),
535        is_local_path: Some(link_info.is_local_path),
536        is_name_at_url: parsed.is_name_at_url,
537        is_archive: link_info.is_archive,
538        is_wheel: link_info.is_wheel,
539    }
540}
541
542fn parse_link_with_name(input: &str) -> Option<(String, String)> {
543    if let Some(egg) = extract_egg_name(input) {
544        return Some((egg, input.to_string()));
545    }
546    None
547}
548
549fn extract_egg_name(input: &str) -> Option<String> {
550    let fragment = input.split('#').nth(1)?;
551    let egg_part = fragment.strip_prefix("egg=")?;
552    let name_part = egg_part.split('&').next()?.trim();
553    if name_part.is_empty() {
554        return None;
555    }
556    let (name, _extras, _) = parse_pep508_requirement(name_part)
557        .map(|parsed| (parsed.name, parsed.extras, parsed.specifiers))
558        .unwrap_or_else(|| (name_part.to_string(), Vec::new(), None));
559    Some(name)
560}
561
/// Classification flags computed for a link (URL, VCS reference or path).
struct LinkFlags {
    /// The link has a scheme or is a VCS link.
    is_url: bool,
    /// The link starts with a VCS prefix (`git+`, `hg+`, `svn+`, `bzr+`).
    is_vcs_url: bool,
    /// The link looks like a filesystem path or `file:` URL.
    is_local_path: bool,
    /// The requirement used the `name @ url` form.
    is_name_at_url: bool,
    /// The link targets a wheel (`.whl`).
    is_wheel: bool,
    /// The link targets an archive; `None` when that cannot be determined.
    is_archive: Option<bool>,
}
570
571fn parse_link_flags(link: &str) -> LinkFlags {
572    let trimmed = link.trim();
573    let is_vcs_url = trimmed.starts_with("git+")
574        || trimmed.starts_with("hg+")
575        || trimmed.starts_with("svn+")
576        || trimmed.starts_with("bzr+");
577    let has_scheme = trimmed.contains("://") || trimmed.starts_with("file:");
578    let is_local_path = trimmed.starts_with("./")
579        || trimmed.starts_with("../")
580        || trimmed.starts_with('/')
581        || trimmed.starts_with('~')
582        || trimmed.starts_with("file:");
583
584    let is_wheel = trimmed.ends_with(".whl");
585    let is_archive = if is_wheel
586        || trimmed.ends_with(".zip")
587        || trimmed.ends_with(".tar.gz")
588        || trimmed.ends_with(".tgz")
589        || trimmed.ends_with(".tar.bz2")
590        || trimmed.ends_with(".tar")
591    {
592        Some(true)
593    } else if has_scheme || is_local_path {
594        Some(false)
595    } else {
596        None
597    };
598
599    LinkFlags {
600        is_url: has_scheme || is_vcs_url,
601        is_vcs_url,
602        is_local_path,
603        is_name_at_url: false,
604        is_archive,
605        is_wheel,
606    }
607}
608
/// Returns `true` when `input` looks like a link rather than a package name:
/// it contains `://`, or starts with a VCS prefix, `file:`, or a path prefix
/// (`./`, `../`, `/`, `~`). Leading and trailing whitespace is ignored.
fn is_link_like(input: &str) -> bool {
    const LINK_PREFIXES: [&str; 9] = [
        "git+", "hg+", "svn+", "bzr+", "file:", "./", "../", "/", "~",
    ];

    let candidate = input.trim();
    candidate.contains("://")
        || LINK_PREFIXES
            .iter()
            .any(|prefix| candidate.starts_with(*prefix))
}
622
/// Returns the exact version pinned by `specifiers`, if any.
///
/// Only a single `==version` or `===version` clause counts as a pin.
/// Multiple clauses (comma-separated), other operators, empty versions and
/// wildcard versions (`==1.*`) yield `None`.
fn extract_pinned_version(specifiers: &str) -> Option<String> {
    let trimmed = specifiers.trim();
    if trimmed.contains(',') {
        return None;
    }

    // `===` must be tried first: `==` is a prefix of it and would otherwise
    // leave a stray `=` at the front of the version.
    let version = trimmed
        .strip_prefix("===")
        .or_else(|| trimmed.strip_prefix("=="))?
        .trim();

    if version.is_empty() || version.contains('*') {
        None
    } else {
        Some(version.to_string())
    }
}
644
645fn create_pypi_purl(name: &str, version: Option<&str>) -> Option<String> {
646    let mut purl = PackageUrl::new(RequirementsTxtParser::PACKAGE_TYPE.as_str(), name).ok()?;
647    if let Some(version) = version {
648        purl.with_version(version).ok()?;
649    }
650    Some(purl.to_string())
651}
652
/// Normalizes a PyPI project name: trims surrounding whitespace, lower-cases
/// ASCII letters, and collapses every run of `-`, `_` and `.` into a single
/// `-`.
fn normalize_pypi_name(name: &str) -> String {
    let trimmed = name.trim();
    let mut normalized = String::with_capacity(trimmed.len());

    for ch in trimmed.chars() {
        match ch {
            '-' | '_' | '.' => {
                // Only separators ever push '-', so a trailing '-' means we
                // are still inside the same run of separators.
                if !normalized.ends_with('-') {
                    normalized.push('-');
                }
            }
            other => normalized.push(other.to_ascii_lowercase()),
        }
    }

    normalized
}
671
// Registers this parser's metadata with the crate.
// NOTE(review): the arguments appear to be (description, path glob patterns,
// package type, primary language, documentation URL) — confirm against the
// `register_parser!` macro definition.
crate::register_parser!(
    "pip requirements file",
    &[
        "**/requirements*.txt",
        "**/requirements*.in",
        "**/requirements/*.txt"
    ],
    "pypi",
    "Python",
    Some("https://pip.pypa.io/en/latest/reference/requirements-file-format/"),
);