// keyhog_core/source.rs

1//! Source trait and chunk types: the abstraction for pluggable input backends.
2
3// Debt bucket: 9 items predating the crate floor raising `missing_docs` to
4// `warn`. Remove this allow once every Source-trait item is documented.
5#![allow(missing_docs)]
6
7use crate::SensitiveString;
8use serde::Serialize;
9use thiserror::Error;
10
11/// A scannable chunk of text with metadata about where it came from.
12///
13/// # Examples
14///
15/// ```rust
16/// use keyhog_core::{Chunk, ChunkMetadata};
17///
18/// let chunk = Chunk {
19///     data: "API_KEY=sk_live_example".into(),
20///     metadata: ChunkMetadata {
21///         source_type: "filesystem".into(),
22///         path: Some("app.env".into()),
23///         ..Default::default()
24///     },
25/// };
26///
27/// assert_eq!(chunk.metadata.path.as_deref(), Some("app.env"));
28/// ```
29#[derive(Debug, Clone, Serialize)]
30pub struct Chunk {
31    /// UTF-8 text content to scan.
32    pub data: SensitiveString,
33    /// Provenance details used in findings and reporters.
34    pub metadata: ChunkMetadata,
35}
36
37impl From<String> for Chunk {
38    fn from(data: String) -> Self {
39        Self {
40            data: data.into(),
41            metadata: ChunkMetadata::default(),
42        }
43    }
44}
45
46impl From<&str> for Chunk {
47    fn from(data: &str) -> Self {
48        Self::from(data.to_string())
49    }
50}
51
52/// Metadata that tracks the source location for a scanned chunk.
53///
54/// # Examples
55///
56/// ```rust
57/// use keyhog_core::ChunkMetadata;
58///
59/// let metadata = ChunkMetadata {
60///     source_type: "git-diff".into(),
61///     path: Some("src/lib.rs".into()),
62///     commit: Some("abc123".into()),
63///     author: Some("Dev".into()),
64///     date: Some("2026-03-26T00:00:00Z".into()),
65///     ..Default::default()
66/// };
67///
68/// assert_eq!(metadata.source_type, "git-diff");
69/// ```
70#[derive(Debug, Clone, Serialize, Default)]
71pub struct ChunkMetadata {
72    pub source_type: String,
73    pub path: Option<String>,
74    pub commit: Option<String>,
75    pub author: Option<String>,
76    pub date: Option<String>,
77    pub base_offset: usize,
78    /// Number of lines that precede `base_offset` in the original file -
79    /// the line-number analog of `base_offset`. Zero for whole-file chunks
80    /// (single-pass mmap, stdin, http, git diffs). Non-zero only when a
81    /// source slices one file into multiple chunks (the filesystem
82    /// `>window_size` windowed path), where each window after the first
83    /// starts partway through the file. The scanner computes a match's
84    /// line number *within the chunk text* and adds this base so the
85    /// reported line is the absolute file line, not the per-window one -
86    /// exactly mirroring how `base_offset` makes the byte offset absolute.
87    /// Without it, a secret on line 584307 of a 70 MiB file was reported
88    /// at the window-local line (e.g. line 2), making findings impossible
89    /// to locate.
90    #[serde(default)]
91    pub base_line: usize,
92    /// File mtime in nanoseconds since UNIX epoch, when the source can
93    /// surface it cheaply (filesystem walks). Optional because non-fs
94    /// sources (stdin, http, git diffs) don't have a meaningful mtime.
95    /// Populated to drive the merkle-index metadata fast-path.
96    #[serde(default, skip_serializing_if = "Option::is_none")]
97    pub mtime_ns: Option<u64>,
98    /// File size in bytes, when known cheaply at chunk-production time.
99    /// Same shape and rationale as `mtime_ns`.
100    #[serde(default, skip_serializing_if = "Option::is_none")]
101    pub size_bytes: Option<u64>,
102}
103
104/// Produces chunks of text for the scanner to process.
105/// Each implementation handles a different input source.
106///
107/// # Examples
108///
109/// ```rust
110/// use keyhog_core::{Chunk, ChunkMetadata, Source, SourceError};
111///
112/// struct StaticSource;
113///
114/// impl Source for StaticSource {
115///     fn name(&self) -> &str {
116///         "static"
117///     }
118///
119///     fn chunks(&self) -> Box<dyn Iterator<Item = Result<Chunk, SourceError>> + '_> {
120///         Box::new(std::iter::once(Ok(Chunk {
121///             data: "TOKEN=value".into(),
122///             metadata: ChunkMetadata {
123///                 source_type: "static".into(),
124///                 ..Default::default()
125///             },
126///         })))
127///     }
128///
129///     fn as_any(&self) -> &dyn std::any::Any {
130///         self
131///     }
132/// }
133///
134/// let source = StaticSource;
135/// assert_eq!(source.name(), "static");
136/// ```
137pub trait Source: Send + Sync {
138    /// Human-readable source name used in warnings and telemetry.
139    fn name(&self) -> &str;
140    /// Yield all readable chunks from this source.
141    fn chunks(&self) -> Box<dyn Iterator<Item = Result<Chunk, SourceError>> + '_>;
142    /// Support downcasting to concrete types.
143    fn as_any(&self) -> &dyn std::any::Any;
144}
145
146/// Errors returned by input sources while enumerating or reading content.
147///
148/// # Examples
149///
150/// ```rust
151/// use keyhog_core::SourceError;
152///
153/// let error = SourceError::Other("pass a readable file or directory".into());
154/// assert!(error.to_string().contains("Fix"));
155/// ```
156#[derive(Debug, Error)]
157pub enum SourceError {
158    #[error(
159        "failed to read source: {0}. Fix: check the path exists, is readable, and is not a broken symlink"
160    )]
161    Io(#[from] std::io::Error),
162    #[error(
163        "failed to access git source: {0}. Fix: run inside a valid git repository and verify the requested refs exist"
164    )]
165    Git(String),
166    #[error(
167        "failed to read source: {0}. Fix: adjust the source settings or input so KeyHog can read plain text safely"
168    )]
169    Other(String),
170}