keyhog_core/source.rs
1//! Source trait and chunk types: the abstraction for pluggable input backends.
2
3// Debt bucket: 9 items predating the crate floor raising `missing_docs` to
4// `warn`. Remove this allow once every Source-trait item is documented.
5#![allow(missing_docs)]
6
7use crate::SensitiveString;
8use serde::Serialize;
9use thiserror::Error;
10
11/// A scannable chunk of text with metadata about where it came from.
12///
13/// # Examples
14///
15/// ```rust
16/// use keyhog_core::{Chunk, ChunkMetadata};
17///
18/// let chunk = Chunk {
19/// data: "API_KEY=sk_live_example".into(),
20/// metadata: ChunkMetadata {
21/// source_type: "filesystem".into(),
22/// path: Some("app.env".into()),
23/// ..Default::default()
24/// },
25/// };
26///
27/// assert_eq!(chunk.metadata.path.as_deref(), Some("app.env"));
28/// ```
29#[derive(Debug, Clone, Serialize)]
30pub struct Chunk {
31 /// UTF-8 text content to scan.
32 pub data: SensitiveString,
33 /// Provenance details used in findings and reporters.
34 pub metadata: ChunkMetadata,
35}
36
37impl From<String> for Chunk {
38 fn from(data: String) -> Self {
39 Self {
40 data: data.into(),
41 metadata: ChunkMetadata::default(),
42 }
43 }
44}
45
46impl From<&str> for Chunk {
47 fn from(data: &str) -> Self {
48 Self::from(data.to_string())
49 }
50}
51
52/// Metadata that tracks the source location for a scanned chunk.
53///
54/// # Examples
55///
56/// ```rust
57/// use keyhog_core::ChunkMetadata;
58///
59/// let metadata = ChunkMetadata {
60/// source_type: "git-diff".into(),
61/// path: Some("src/lib.rs".into()),
62/// commit: Some("abc123".into()),
63/// author: Some("Dev".into()),
64/// date: Some("2026-03-26T00:00:00Z".into()),
65/// ..Default::default()
66/// };
67///
68/// assert_eq!(metadata.source_type, "git-diff");
69/// ```
70#[derive(Debug, Clone, Serialize, Default)]
71pub struct ChunkMetadata {
72 pub source_type: String,
73 pub path: Option<String>,
74 pub commit: Option<String>,
75 pub author: Option<String>,
76 pub date: Option<String>,
77 pub base_offset: usize,
78 /// Number of lines that precede `base_offset` in the original file -
79 /// the line-number analog of `base_offset`. Zero for whole-file chunks
80 /// (single-pass mmap, stdin, http, git diffs). Non-zero only when a
81 /// source slices one file into multiple chunks (the filesystem
82 /// `>window_size` windowed path), where each window after the first
83 /// starts partway through the file. The scanner computes a match's
84 /// line number *within the chunk text* and adds this base so the
85 /// reported line is the absolute file line, not the per-window one -
86 /// exactly mirroring how `base_offset` makes the byte offset absolute.
87 /// Without it, a secret on line 584307 of a 70 MiB file was reported
88 /// at the window-local line (e.g. line 2), making findings impossible
89 /// to locate.
90 #[serde(default)]
91 pub base_line: usize,
92 /// File mtime in nanoseconds since UNIX epoch, when the source can
93 /// surface it cheaply (filesystem walks). Optional because non-fs
94 /// sources (stdin, http, git diffs) don't have a meaningful mtime.
95 /// Populated to drive the merkle-index metadata fast-path.
96 #[serde(default, skip_serializing_if = "Option::is_none")]
97 pub mtime_ns: Option<u64>,
98 /// File size in bytes, when known cheaply at chunk-production time.
99 /// Same shape and rationale as `mtime_ns`.
100 #[serde(default, skip_serializing_if = "Option::is_none")]
101 pub size_bytes: Option<u64>,
102}
103
104/// Produces chunks of text for the scanner to process.
105/// Each implementation handles a different input source.
106///
107/// # Examples
108///
109/// ```rust
110/// use keyhog_core::{Chunk, ChunkMetadata, Source, SourceError};
111///
112/// struct StaticSource;
113///
114/// impl Source for StaticSource {
115/// fn name(&self) -> &str {
116/// "static"
117/// }
118///
119/// fn chunks(&self) -> Box<dyn Iterator<Item = Result<Chunk, SourceError>> + '_> {
120/// Box::new(std::iter::once(Ok(Chunk {
121/// data: "TOKEN=value".into(),
122/// metadata: ChunkMetadata {
123/// source_type: "static".into(),
124/// ..Default::default()
125/// },
126/// })))
127/// }
128///
129/// fn as_any(&self) -> &dyn std::any::Any {
130/// self
131/// }
132/// }
133///
134/// let source = StaticSource;
135/// assert_eq!(source.name(), "static");
136/// ```
137pub trait Source: Send + Sync {
138 /// Human-readable source name used in warnings and telemetry.
139 fn name(&self) -> &str;
140 /// Yield all readable chunks from this source.
141 fn chunks(&self) -> Box<dyn Iterator<Item = Result<Chunk, SourceError>> + '_>;
142 /// Support downcasting to concrete types.
143 fn as_any(&self) -> &dyn std::any::Any;
144}
145
146/// Errors returned by input sources while enumerating or reading content.
147///
148/// # Examples
149///
150/// ```rust
151/// use keyhog_core::SourceError;
152///
153/// let error = SourceError::Other("pass a readable file or directory".into());
154/// assert!(error.to_string().contains("Fix"));
155/// ```
156#[derive(Debug, Error)]
157pub enum SourceError {
158 #[error(
159 "failed to read source: {0}. Fix: check the path exists, is readable, and is not a broken symlink"
160 )]
161 Io(#[from] std::io::Error),
162 #[error(
163 "failed to access git source: {0}. Fix: run inside a valid git repository and verify the requested refs exist"
164 )]
165 Git(String),
166 #[error(
167 "failed to read source: {0}. Fix: adjust the source settings or input so KeyHog can read plain text safely"
168 )]
169 Other(String),
170}