ip_extract/tag.rs
1use serde::Serialize;
2use std::io::{self, Write};
3use std::ops::Range;
4
5/// A tag representing an IP address found in text.
6#[derive(Clone, Debug, Serialize)]
7pub struct Tag {
8 /// The raw matched text from the haystack (may include defang brackets).
9 matched: String,
10 /// The clean IP address (defang brackets stripped).
11 #[serde(rename = "value")]
12 ip: String,
13 /// The range in the original text where the IP was found.
14 #[serde(skip_serializing_if = "Option::is_none")]
15 range: Option<Range<usize>>,
16 /// The decorated IP with geolocation information.
17 #[serde(skip_serializing_if = "Option::is_none")]
18 decorated: Option<String>,
19}
20
21impl Tag {
22 /// Create a new tag for an IP address.
23 ///
24 /// `matched` is the raw text as it appeared in the haystack (may contain
25 /// defang brackets). `ip` is the clean, refanged IP address.
26 #[inline]
27 pub fn new<S: Into<String>, T: Into<String>>(matched: S, ip: T) -> Tag {
28 Tag {
29 matched: matched.into(),
30 ip: ip.into(),
31 range: None,
32 decorated: None,
33 }
34 }
35
36 /// Set the byte range [start, end) where this tag was found in the original text.
37 #[inline]
38 #[must_use]
39 pub fn with_range(mut self, range: Range<usize>) -> Self {
40 self.range = Some(range);
41 self
42 }
43
44 /// Set a "decorated" version of the IP (e.g., with geolocation metadata).
45 ///
46 /// This string will be used instead of the original IP when calling `Tagged::write`.
47 #[inline]
48 pub fn with_decoration<S: Into<String>>(mut self, decorated: S) -> Self {
49 self.decorated = Some(decorated.into());
50 self
51 }
52
53 /// Get the clean IP address text.
54 #[inline]
55 #[must_use]
56 pub fn ip(&self) -> &str {
57 &self.ip
58 }
59
60 /// Get the raw matched text from the haystack.
61 #[inline]
62 #[must_use]
63 pub fn matched(&self) -> &str {
64 &self.matched
65 }
66
67 /// Get the range of this tag in the original text, if available.
68 #[inline]
69 #[must_use]
70 pub fn range(&self) -> Option<&Range<usize>> {
71 self.range.as_ref()
72 }
73
74 /// Get the decorated version of this IP, if available.
75 #[inline]
76 #[must_use]
77 pub fn decorated(&self) -> Option<&str> {
78 self.decorated.as_deref()
79 }
80}
81
82/// A line of text with tags.
83#[derive(Clone, Debug, Serialize)]
84pub struct Tagged {
85 /// The original text.
86 #[serde(skip_serializing)]
87 text: Vec<u8>,
88 /// The tags found in the text.
89 tags: Vec<Tag>,
90 /// The original text as a string (for JSON serialization).
91 #[serde(rename = "data")]
92 text_data: Option<TextData>,
93}
94
95/// Represents the text data for JSON serialization.
96#[derive(Clone, Debug, Serialize)]
97pub struct TextData {
98 /// The original text as a string.
99 pub text: String,
100}
101
102impl Tagged {
103 /// Create a new `Tagged` container for a slice of text.
104 ///
105 /// This container holds the original text and will collect any `Tag`s found within it.
106 #[inline]
107 #[must_use]
108 pub fn new(text: &[u8]) -> Tagged {
109 // Pre-allocate a reasonable capacity for tags based on text length
110 let capacity = if text.len() > 1000 { 16 } else { 4 };
111 Tagged {
112 text: text.to_vec(),
113 tags: Vec::with_capacity(capacity), // Most lines have few IPs
114 text_data: None,
115 }
116 }
117
118 /// Adds a tag to this text.
119 ///
120 /// The tag should contain a range that corresponds to its position in `self.text()`.
121 #[inline]
122 #[must_use]
123 pub fn tag(mut self, tag: Tag) -> Self {
124 self.tags.push(tag);
125 self
126 }
127
128 /// Get the tags in this text.
129 #[inline]
130 #[must_use]
131 pub fn tags(&self) -> &[Tag] {
132 &self.tags
133 }
134
135 /// Get the original text.
136 #[inline]
137 #[must_use]
138 pub fn text(&self) -> &[u8] {
139 &self.text
140 }
141
142 /// Explicitly sets the text data used for JSON serialization.
143 #[inline]
144 pub fn set_text_data(&mut self, data: TextData) {
145 self.text_data = Some(data);
146 }
147
148 /// Writes the text to the given writer, replacing tagged IPs with their decorated versions.
149 ///
150 /// If a tag has a `decorated()` value, that value is written instead of the original
151 /// bytes in its `range()`. If no decoration is present, the original bytes are written.
152 ///
153 /// Tags MUST be sorted by their start position for this to work correctly.
154 #[inline]
155 pub fn write<W: Write>(&self, wtr: &mut W) -> io::Result<()> {
156 // Fast path for no tags
157 if self.tags.is_empty() {
158 return wtr.write_all(&self.text);
159 }
160
161 // If we have only one tag (common case), optimize for it
162 if self.tags.len() == 1 {
163 let tag = &self.tags[0];
164 if let Some(range) = tag.range() {
165 // Write the text before the tag
166 wtr.write_all(&self.text[..range.start])?;
167
168 // Write the decorated version if available, or the original IP
169 if let Some(decorated) = tag.decorated() {
170 wtr.write_all(decorated.as_bytes())?;
171 } else {
172 wtr.write_all(&self.text[range.clone()])?;
173 }
174
175 // Write the text after the tag
176 wtr.write_all(&self.text[range.end..])?;
177 return Ok(());
178 }
179 }
180
181 // If we have 2 tags (another common case), optimize for it
182 if self.tags.len() == 2 {
183 // Get the two tags
184 let mut tag1 = &self.tags[0];
185 let mut tag2 = &self.tags[1];
186
187 // Ensure tag1 comes before tag2
188 if let (Some(range1), Some(range2)) = (tag1.range(), tag2.range()) {
189 if range1.start > range2.start {
190 std::mem::swap(&mut tag1, &mut tag2);
191 }
192
193 // Write in three parts: before tag1, tag1, between tags, tag2, after tag2
194 wtr.write_all(&self.text[..range1.start])?;
195
196 if let Some(decorated) = tag1.decorated() {
197 wtr.write_all(decorated.as_bytes())?;
198 } else {
199 wtr.write_all(&self.text[range1.clone()])?;
200 }
201
202 wtr.write_all(&self.text[range1.end..range2.start])?;
203
204 if let Some(decorated) = tag2.decorated() {
205 wtr.write_all(decorated.as_bytes())?;
206 } else {
207 wtr.write_all(&self.text[range2.clone()])?;
208 }
209
210 wtr.write_all(&self.text[range2.end..])?;
211 return Ok(());
212 }
213 }
214
215 // For multiple tags, process them in order
216 // Tags should always be sorted by position since the extractor finds matches left-to-right
217 #[cfg(debug_assertions)]
218 {
219 for i in 1..self.tags.len() {
220 if let (Some(prev), Some(curr)) = (self.tags[i - 1].range(), self.tags[i].range()) {
221 debug_assert!(prev.start <= curr.start, "Tags must be sorted by position");
222 }
223 }
224 }
225
226 let mut last_end = 0;
227 for tag in &self.tags {
228 if let Some(range) = tag.range() {
229 // Write the text between the previous tag and this one
230 wtr.write_all(&self.text[last_end..range.start])?;
231
232 // Write the decorated version if available, or the original IP
233 if let Some(decorated) = tag.decorated() {
234 wtr.write_all(decorated.as_bytes())?;
235 } else {
236 wtr.write_all(&self.text[range.clone()])?;
237 }
238
239 last_end = range.end;
240 }
241 }
242
243 // Write any remaining text
244 if last_end < self.text.len() {
245 wtr.write_all(&self.text[last_end..])?;
246 }
247
248 Ok(())
249 }
250
251 /// Writes the `Tagged` object as a JSON object to the given writer.
252 ///
253 /// This is useful for exporting structured metadata about the IPs found in the text.
254 #[inline]
255 pub fn write_json<W: Write + ?Sized>(&mut self, wtr: &mut W) -> io::Result<()> {
256 // Set the text data for JSON serialization
257 if self.text_data.is_none() {
258 // Fast path for direct UTF-8 conversion
259 if self.text.is_empty() {
260 self.text_data = Some(TextData {
261 text: String::new(),
262 });
263 } else if let Ok(s) = std::str::from_utf8(&self.text) {
264 // Direct conversion without allocation
265 self.text_data = Some(TextData {
266 text: s.to_string(),
267 });
268 } else {
269 // Fallback for non-UTF8
270 self.text_data = Some(TextData {
271 text: String::from_utf8_lossy(&self.text).to_string(),
272 });
273 }
274 }
275
276 // Serialize to JSON using the faster non-pretty writer
277 serde_json::to_writer(wtr, self)?;
278 Ok(())
279 }
280}