1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
use serde::Serialize;
use std::io::{self, Write};
use std::ops::Range;
/// A tag representing an IP address found in text.
///
/// When serialized, `ip` is emitted under the key `value`, and the `range`
/// and `decorated` fields are omitted entirely while they are `None`.
#[derive(Clone, Debug, Serialize)]
pub struct Tag {
/// The raw matched text from the haystack (may include defang brackets).
matched: String,
/// The clean IP address (defang brackets stripped). Serialized as `value`.
#[serde(rename = "value")]
ip: String,
/// The range in the original text where the IP was found, if one was recorded.
#[serde(skip_serializing_if = "Option::is_none")]
range: Option<Range<usize>>,
/// The decorated IP with geolocation information, if one was attached.
#[serde(skip_serializing_if = "Option::is_none")]
decorated: Option<String>,
}
impl Tag {
    /// Create a new tag for an IP address.
    ///
    /// `matched` is the raw text as it appeared in the haystack (may contain
    /// defang brackets). `ip` is the clean, refanged IP address.
    ///
    /// The returned tag has neither a range nor a decoration; attach them with
    /// `with_range` and `with_decoration`.
    #[inline]
    #[must_use]
    pub fn new<S: Into<String>, T: Into<String>>(matched: S, ip: T) -> Tag {
        Tag {
            matched: matched.into(),
            ip: ip.into(),
            range: None,
            decorated: None,
        }
    }

    /// Set the byte range [start, end) where this tag was found in the original text.
    ///
    /// Consumes the tag and returns it with the range stored, replacing any
    /// range set earlier.
    #[inline]
    #[must_use]
    pub fn with_range(mut self, range: Range<usize>) -> Self {
        self.range = Some(range);
        self
    }

    /// Set a "decorated" version of the IP (e.g., with geolocation metadata).
    ///
    /// This string will be used instead of the original IP when calling `Tagged::write`.
    ///
    /// Consumes the tag and returns it with the decoration stored, replacing
    /// any decoration set earlier.
    #[inline]
    #[must_use]
    pub fn with_decoration<S: Into<String>>(mut self, decorated: S) -> Self {
        self.decorated = Some(decorated.into());
        self
    }

    /// Get the clean IP address text.
    #[inline]
    #[must_use]
    pub fn ip(&self) -> &str {
        &self.ip
    }

    /// Get the raw matched text from the haystack.
    #[inline]
    #[must_use]
    pub fn matched(&self) -> &str {
        &self.matched
    }

    /// Get the range of this tag in the original text, if available.
    #[inline]
    #[must_use]
    pub fn range(&self) -> Option<&Range<usize>> {
        self.range.as_ref()
    }

    /// Get the decorated version of this IP, if available.
    #[inline]
    #[must_use]
    pub fn decorated(&self) -> Option<&str> {
        self.decorated.as_deref()
    }
}
/// A line of text with tags.
///
/// Only `tags` and the `data` entry are serialized; the raw bytes in `text`
/// are skipped.
#[derive(Clone, Debug, Serialize)]
pub struct Tagged {
/// The original text as raw bytes. Never serialized directly.
#[serde(skip_serializing)]
text: Vec<u8>,
/// The tags found in the text.
tags: Vec<Tag>,
/// The original text as a string (for JSON serialization), emitted under
/// the key `data`. Stays `None` until `set_text_data` or `write_json` fills it.
#[serde(rename = "data")]
text_data: Option<TextData>,
}
/// Represents the text data for JSON serialization.
///
/// Wraps the text in a struct so that it serializes as an object with a
/// single `text` field.
#[derive(Clone, Debug, Serialize)]
pub struct TextData {
/// The original text as a string.
pub text: String,
}
impl Tagged {
    /// Create a new `Tagged` container for a slice of text.
    ///
    /// The bytes are copied into the container, which then collects any
    /// `Tag`s found within the text.
    #[inline]
    #[must_use]
    pub fn new(text: &[u8]) -> Tagged {
        // Most lines have few IPs; reserve a little more room for long lines.
        let capacity = if text.len() > 1000 { 16 } else { 4 };
        Tagged {
            text: text.to_vec(),
            tags: Vec::with_capacity(capacity),
            text_data: None,
        }
    }

    /// Adds a tag to this text.
    ///
    /// The tag should contain a range that corresponds to its position in `self.text()`.
    /// Tags are stored in the order they are added.
    #[inline]
    #[must_use]
    pub fn tag(mut self, tag: Tag) -> Self {
        self.tags.push(tag);
        self
    }

    /// Get the tags in this text.
    #[inline]
    #[must_use]
    pub fn tags(&self) -> &[Tag] {
        &self.tags
    }

    /// Get the original text.
    #[inline]
    #[must_use]
    pub fn text(&self) -> &[u8] {
        &self.text
    }

    /// Explicitly sets the text data used for JSON serialization.
    ///
    /// Once set, `write_json` uses this value instead of deriving one from
    /// the raw text.
    #[inline]
    pub fn set_text_data(&mut self, data: TextData) {
        self.text_data = Some(data);
    }

    /// Writes the text to the given writer, replacing tagged IPs with their decorated versions.
    ///
    /// If a tag has a `decorated()` value, that value is written instead of the original
    /// bytes in its `range()`. If no decoration is present, the original bytes are written.
    /// Tags without a range are ignored.
    ///
    /// Tags MUST be sorted by their start position for this to work correctly. The only
    /// exception is a pair of exactly two ranged tags, which is put into order here.
    ///
    /// # Errors
    ///
    /// Returns any I/O error reported by `wtr`.
    ///
    /// # Panics
    ///
    /// Can panic when a tag's range does not fit inside the text, or when a range starts
    /// before the previous one ends. In debug builds, unsorted input also trips an
    /// assertion.
    #[inline]
    pub fn write<W: Write>(&self, wtr: &mut W) -> io::Result<()> {
        // Two ranged tags may arrive out of order. Reorder the tags themselves so
        // that each range stays paired with its own decoration.
        if let [first, second] = self.tags.as_slice() {
            if let (Some(a), Some(b)) = (first.range(), second.range()) {
                let ordered = if a.start > b.start {
                    [second, first]
                } else {
                    [first, second]
                };
                return self.write_ordered(ordered.iter().copied(), wtr);
            }
        }

        // Tags should always be sorted by position since the extractor finds
        // matches left-to-right; verify that in debug builds only.
        debug_assert!(
            self.tags
                .windows(2)
                .all(|pair| match (pair[0].range(), pair[1].range()) {
                    (Some(prev), Some(curr)) => prev.start <= curr.start,
                    _ => true,
                }),
            "Tags must be sorted by position"
        );

        self.write_ordered(&self.tags, wtr)
    }

    /// Writes the text with each ranged tag in `tags` substituted, assuming `tags` is
    /// already ordered by start position.
    fn write_ordered<'a, W, I>(&self, tags: I, wtr: &mut W) -> io::Result<()>
    where
        W: Write,
        I: IntoIterator<Item = &'a Tag>,
    {
        // Index of the first text byte that has not been written yet.
        let mut cursor = 0;
        for tag in tags {
            if let Some(range) = tag.range() {
                // Untouched text between the previous tag and this one.
                wtr.write_all(&self.text[cursor..range.start])?;
                // The decoration if present, otherwise the original bytes.
                match tag.decorated() {
                    Some(decorated) => wtr.write_all(decorated.as_bytes())?,
                    None => wtr.write_all(&self.text[range.clone()])?,
                }
                cursor = range.end;
            }
        }
        // Whatever follows the last tag (or the whole text when nothing was ranged).
        if cursor < self.text.len() {
            wtr.write_all(&self.text[cursor..])?;
        }
        Ok(())
    }

    /// Writes the `Tagged` object as a JSON object to the given writer.
    ///
    /// This is useful for exporting structured metadata about the IPs found in the text.
    ///
    /// If no text data has been set yet, it is derived from the raw text first and kept
    /// on `self`; invalid UTF-8 sequences are replaced with U+FFFD.
    ///
    /// # Errors
    ///
    /// Returns an `io::Error` when serialization or writing to `wtr` fails.
    #[inline]
    pub fn write_json<W: Write + ?Sized>(&mut self, wtr: &mut W) -> io::Result<()> {
        if self.text_data.is_none() {
            // The lossy conversion borrows valid UTF-8 (one copy in `into_owned`) and
            // only builds a repaired string when the bytes are invalid. Empty text
            // yields an empty `String`.
            let text = String::from_utf8_lossy(&self.text).into_owned();
            self.text_data = Some(TextData { text });
        }
        // Compact (non-pretty) JSON; the serde_json error converts into `io::Error`.
        serde_json::to_writer(wtr, self)?;
        Ok(())
    }
}