repotoire 0.8.0

Graph-powered code analysis CLI. 110 detectors for security, architecture, bus factor, and code quality.
Documentation
//! Shared parser for `# repotoire: <kind>[<args>]` source-level annotations.
//!
//! This module is the consolidation point for the per-detector annotation
//! parsers introduced in Phase 2a (`insecure_crypto/annotation.rs`) and
//! Phase 2b (`path_traversal/annotation.rs`). Phase 2c is the third
//! detector that needs the same parser — the rule-of-three trigger for
//! extraction.
//!
//! # Why a shared module now
//!
//! Both Phase 2a and 2b annotation files were byte-identical parser
//! bodies differing only in:
//!
//! - Docstring examples (per-kind illustrative annotations)
//! - Test data (per-kind canonical inputs)
//! - Pub visibility (`pub(super)` vs `pub` — accidental drift)
//!
//! Phase 2c brings a third detector (`tls-disabled` kind). Three is the
//! point at which copying becomes more expensive than abstracting, per
//! the explicit deferral in the Phase 2b annotation docstring:
//!
//! > "Two copies is the right number for the *first* opportunity to
//! > consolidate; consolidating from one detector is premature. Phase 2c
//! > (InsecureTls — the third detector) is the right time to extract a
//! > shared `dual_branch_annotation.rs` and have all per-detector
//! > predictors register kind names against it."
//!
//! # API contract
//!
//! [`parse_python_comment`] is **kind-agnostic** — it parses any
//! `# repotoire: <kind>[<args>]` comment into an [`Annotation`] without
//! validating the kind. Per-detector wrappers decide which kinds are
//! meaningful for that detector and how to interpret their args.
//!
//! Forward-compatibility convention: unknown kinds are returned as-is
//! from this parser (the caller may ignore them). This matches `# noqa`
//! — codes the linter doesn't recognize are silently tolerated.
//!
//! # Syntax (preserved verbatim from the Phase 2a/2b parsers)
//!
//! ```text
//! # repotoire: <kind>[<comma-separated-args>]
//! ```
//!
//! Whitespace around `#` and `:`, before `[`, and inside the brackets
//! is tolerated; the parser normalizes it away by trimming. All four of
//! these parse identically into the same [`Annotation`]:
//!
//! ```text
//! # repotoire: tls-disabled[self-signed-dev-cert]
//! #repotoire:tls-disabled[self-signed-dev-cert]
//! # repotoire : tls-disabled [ self-signed-dev-cert ]
//! #  repotoire:  tls-disabled [ self-signed-dev-cert ]
//! ```
//!
//! Args are comma-separated; empty args inside the list are dropped
//! (`[a,,b]` → `["a", "b"]`). The args list is optional.
//!
//! # What this parser does NOT do
//!
//! - Does not validate kind names (caller registers what's meaningful).
//! - Does not validate arg semantics.
//! - Does not handle multiple annotations on one line.
//! - Does not look for annotations on adjacent lines (annotations are
//!   end-of-line on the call site, matching `# noqa` / `# nosec` /
//!   `# type: ignore`).
//! - Does not look at `//` (C-family) or `/* ... */` comments — Python-
//!   only for now. Add a separate parse function when the first
//!   non-Python detector needs annotations.

/// One `# repotoire: <kind>[<args>]` annotation, as produced by
/// [`parse_python_comment`].
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct Annotation {
    /// Annotation kind exactly as written in the source. No case
    /// folding is applied, so `Internal-Path` is stored as
    /// `"Internal-Path"`. Callers that want case-insensitive matching
    /// should compare against `.to_ascii_lowercase()`.
    pub kind: String,
    /// Entries of the bracketed argument list, in source order. Empty
    /// when the annotation carries no bracketed list.
    pub args: Vec<String>,
}

/// Parse a single Python-style comment line into an [`Annotation`], if
/// it is a `# repotoire: <kind>[<args>]` annotation.
///
/// `line` may be the bare comment (`"# repotoire: ..."`) or a full
/// source line with the comment trailing the code
/// (`"open(p)  # repotoire: ..."`). Parsing starts at the **first** `#`
/// on the line. No Python tokenization is performed, so a `#` that
/// appears earlier on the line (for example inside a string literal)
/// shadows a later annotation.
///
/// The kind is the first run of characters after `repotoire:` that
/// contains neither whitespace nor `[`. An args list is recognized only
/// when `[` is the next non-whitespace character after the kind.
/// Anything else following the kind is free-form trailing text and is
/// ignored, brackets included: `# repotoire: tls-disabled see [docs]`
/// yields kind `tls-disabled` with no args.
///
/// Args are split on `,`, trimmed, and empty entries are dropped.
///
/// Returns `None` when:
/// - the line contains no `#`,
/// - the comment doesn't start with `repotoire:` (after `#` and any
///   whitespace; case-insensitive on `repotoire`, whitespace allowed
///   before the `:`),
/// - the kind is missing or empty,
/// - an args list is opened with `[` but never closed with `]`.
pub fn parse_python_comment(line: &str) -> Option<Annotation> {
    const KEYWORD: &str = "repotoire";

    let (_, comment) = line.split_once('#')?;
    let comment = comment.trim_start();

    // `get` returns `None` when the comment is shorter than the keyword
    // or the cut would land inside a multi-byte character; neither case
    // can be the (pure-ASCII) keyword, so bailing out is correct.
    let head = comment.get(..KEYWORD.len())?;
    if !head.eq_ignore_ascii_case(KEYWORD) {
        return None;
    }
    let body = comment[KEYWORD.len()..]
        .trim_start()
        .strip_prefix(':')?
        .trim_start();

    // The kind ends at the first whitespace or `[`, whichever comes
    // first, so trailing explanation text never becomes part of it.
    let kind_end = body
        .find(|c: char| c == '[' || c.is_whitespace())
        .unwrap_or(body.len());
    let (kind, rest) = body.split_at(kind_end);
    if kind.is_empty() {
        return None;
    }

    // Only a `[` directly after the kind (modulo whitespace) opens an
    // args list; an opened list must be closed or the annotation is
    // rejected as malformed.
    let args = match rest.trim_start().strip_prefix('[') {
        Some(list) => {
            let (inner, _) = list.split_once(']')?;
            inner
                .split(',')
                .map(str::trim)
                .filter(|arg| !arg.is_empty())
                .map(String::from)
                .collect()
        }
        None => Vec::new(),
    };

    Some(Annotation {
        kind: kind.to_string(),
        args,
    })
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Build the expected [`Annotation`] from borrowed pieces.
    fn expected(kind: &str, args: &[&str]) -> Annotation {
        Annotation {
            kind: kind.to_owned(),
            args: args.iter().copied().map(String::from).collect(),
        }
    }

    /// Assert that `line` parses into `kind` with exactly `args`.
    fn assert_parses(line: &str, kind: &str, args: &[&str]) {
        assert_eq!(
            parse_python_comment(line),
            Some(expected(kind, args)),
            "line: {:?}",
            line,
        );
    }

    /// Assert that `line` is not recognized as an annotation.
    fn assert_rejected(line: &str) {
        assert_eq!(parse_python_comment(line), None, "line: {:?}", line);
    }

    // ── Canonical kinds, one test per Phase 2 detector kind. Keeping
    //    every detector's kind here guards the parser's kind-agnostic
    //    contract: coupling the parser to a single kind breaks the
    //    tests for the others. ──

    #[test]
    fn canonical_phase2a_protocol_required_parses() {
        assert_parses(
            "# repotoire: protocol-required[RFC7616]",
            "protocol-required",
            &["RFC7616"],
        );
    }

    #[test]
    fn canonical_phase2b_internal_path_parses() {
        assert_parses(
            "# repotoire: internal-path[validated-by-caller]",
            "internal-path",
            &["validated-by-caller"],
        );
    }

    #[test]
    fn canonical_phase2b_user_controlled_parses() {
        assert_parses(
            "# repotoire: user-controlled[GET-request]",
            "user-controlled",
            &["GET-request"],
        );
    }

    #[test]
    fn canonical_phase2c_tls_disabled_parses() {
        assert_parses(
            "# repotoire: tls-disabled[self-signed-dev-cert]",
            "tls-disabled",
            &["self-signed-dev-cert"],
        );
    }

    // ── Whitespace tolerance ──

    #[test]
    fn whitespace_variants_parse_identically() {
        let variants = [
            "# repotoire: tls-disabled[dev]",
            "#repotoire:tls-disabled[dev]",
            "#  repotoire:  tls-disabled [ dev ]",
            "# repotoire : tls-disabled[ dev ]",
        ];
        for line in variants.iter() {
            assert_parses(line, "tls-disabled", &["dev"]);
        }
    }

    #[test]
    fn case_insensitive_keyword_only() {
        // Only the `repotoire` keyword is matched case-insensitively;
        // the kind that follows keeps the case it was written in.
        for line in ["# Repotoire: tls-disabled", "# REPOTOIRE: tls-disabled"].iter() {
            assert_parses(line, "tls-disabled", &[]);
        }
    }

    #[test]
    fn case_preserved_in_kind_and_args() {
        let Annotation { kind, args } =
            parse_python_comment("# repotoire: TLS-Disabled[Self-Signed]").expect("should parse");
        assert_eq!(kind, "TLS-Disabled");
        assert_eq!(args, vec!["Self-Signed"]);
    }

    // ── Args parsing ──

    #[test]
    fn no_brackets_means_no_args() {
        assert_parses("# repotoire: tls-disabled", "tls-disabled", &[]);
    }

    #[test]
    fn empty_brackets_means_empty_args() {
        assert_parses("# repotoire: tls-disabled[]", "tls-disabled", &[]);
    }

    #[test]
    fn multiple_args_split_on_commas() {
        assert_parses(
            "# repotoire: user-controlled[GET, POST, body]",
            "user-controlled",
            &["GET", "POST", "body"],
        );
    }

    #[test]
    fn empty_args_in_list_are_dropped() {
        assert_parses("# repotoire: kind[a,,b]", "kind", &["a", "b"]);
    }

    // ── In-line on a real source line ──

    #[test]
    fn end_of_line_after_call_parses() {
        assert_parses(
            "    return requests.get(url, verify=False)  # repotoire: tls-disabled[trusted-dev]",
            "tls-disabled",
            &["trusted-dev"],
        );
    }

    // ── Non-annotation comments ──

    #[test]
    fn unrelated_comment_returns_none() {
        let lines = [
            "# TODO: fix this",
            "# noqa: E501",
            "# nosec",
            "# type: ignore",
            "open(p)",
        ];
        for line in lines.iter() {
            assert_rejected(line);
        }
    }

    #[test]
    fn empty_kind_returns_none() {
        for line in ["# repotoire:", "# repotoire: ", "# repotoire: [arg]"].iter() {
            assert_rejected(line);
        }
    }

    #[test]
    fn unbalanced_bracket_returns_none() {
        assert_rejected("# repotoire: tls-disabled[trusted");
    }

    // ── Forward compatibility ──

    #[test]
    fn unknown_kind_still_parses_forward_compat() {
        // The parser does not validate kinds: a kind no detector knows
        // about is handed to the caller unchanged, so shipping a new
        // detector never requires touching the parser.
        assert_parses(
            "# repotoire: future-kind-not-yet-shipped[arg1]",
            "future-kind-not-yet-shipped",
            &["arg1"],
        );
    }

    #[test]
    fn trailing_text_after_no_args_kind_stops_at_whitespace() {
        assert_parses(
            "# repotoire: tls-disabled some explanation",
            "tls-disabled",
            &[],
        );
    }

    // ── Edge cases ──

    #[test]
    fn empty_string_returns_none() {
        assert_rejected("");
    }

    #[test]
    fn line_without_any_comment_returns_none() {
        assert_rejected("open(p)");
        assert_rejected("    ");
    }
}