Skip to main content

ip_extract/
tag.rs

1use serde::Serialize;
2use std::io::{self, Write};
3use std::ops::Range;
4
5/// A tag representing an IP address found in text.
6#[derive(Clone, Debug, Serialize)]
7pub struct Tag {
8    /// The raw matched text from the haystack (may include defang brackets).
9    matched: String,
10    /// The clean IP address (defang brackets stripped).
11    #[serde(rename = "value")]
12    ip: String,
13    /// The range in the original text where the IP was found.
14    #[serde(skip_serializing_if = "Option::is_none")]
15    range: Option<Range<usize>>,
16    /// The decorated IP with geolocation information.
17    #[serde(skip_serializing_if = "Option::is_none")]
18    decorated: Option<String>,
19}
20
21impl Tag {
22    /// Create a new tag for an IP address.
23    ///
24    /// `matched` is the raw text as it appeared in the haystack (may contain
25    /// defang brackets). `ip` is the clean, refanged IP address.
26    #[inline]
27    pub fn new<S: Into<String>, T: Into<String>>(matched: S, ip: T) -> Tag {
28        Tag {
29            matched: matched.into(),
30            ip: ip.into(),
31            range: None,
32            decorated: None,
33        }
34    }
35
36    /// Set the byte range [start, end) where this tag was found in the original text.
37    #[inline]
38    #[must_use]
39    pub fn with_range(mut self, range: Range<usize>) -> Self {
40        self.range = Some(range);
41        self
42    }
43
44    /// Set a "decorated" version of the IP (e.g., with geolocation metadata).
45    ///
46    /// This string will be used instead of the original IP when calling `Tagged::write`.
47    #[inline]
48    pub fn with_decoration<S: Into<String>>(mut self, decorated: S) -> Self {
49        self.decorated = Some(decorated.into());
50        self
51    }
52
53    /// Get the clean IP address text.
54    #[inline]
55    #[must_use]
56    pub fn ip(&self) -> &str {
57        &self.ip
58    }
59
60    /// Get the raw matched text from the haystack.
61    #[inline]
62    #[must_use]
63    pub fn matched(&self) -> &str {
64        &self.matched
65    }
66
67    /// Get the range of this tag in the original text, if available.
68    #[inline]
69    #[must_use]
70    pub fn range(&self) -> Option<&Range<usize>> {
71        self.range.as_ref()
72    }
73
74    /// Get the decorated version of this IP, if available.
75    #[inline]
76    #[must_use]
77    pub fn decorated(&self) -> Option<&str> {
78        self.decorated.as_deref()
79    }
80}
81
82/// A line of text with tags.
83#[derive(Clone, Debug, Serialize)]
84pub struct Tagged {
85    /// The original text.
86    #[serde(skip_serializing)]
87    text: Vec<u8>,
88    /// The tags found in the text.
89    tags: Vec<Tag>,
90    /// The original text as a string (for JSON serialization).
91    #[serde(rename = "data")]
92    text_data: Option<TextData>,
93}
94
95/// Represents the text data for JSON serialization.
96#[derive(Clone, Debug, Serialize)]
97pub struct TextData {
98    /// The original text as a string.
99    pub text: String,
100}
101
102impl Tagged {
103    /// Create a new `Tagged` container for a slice of text.
104    ///
105    /// This container holds the original text and will collect any `Tag`s found within it.
106    #[inline]
107    #[must_use]
108    pub fn new(text: &[u8]) -> Tagged {
109        // Pre-allocate a reasonable capacity for tags based on text length
110        let capacity = if text.len() > 1000 { 16 } else { 4 };
111        Tagged {
112            text: text.to_vec(),
113            tags: Vec::with_capacity(capacity), // Most lines have few IPs
114            text_data: None,
115        }
116    }
117
118    /// Adds a tag to this text.
119    ///
120    /// The tag should contain a range that corresponds to its position in `self.text()`.
121    #[inline]
122    #[must_use]
123    pub fn tag(mut self, tag: Tag) -> Self {
124        self.tags.push(tag);
125        self
126    }
127
128    /// Get the tags in this text.
129    #[inline]
130    #[must_use]
131    pub fn tags(&self) -> &[Tag] {
132        &self.tags
133    }
134
135    /// Get the original text.
136    #[inline]
137    #[must_use]
138    pub fn text(&self) -> &[u8] {
139        &self.text
140    }
141
142    /// Explicitly sets the text data used for JSON serialization.
143    #[inline]
144    pub fn set_text_data(&mut self, data: TextData) {
145        self.text_data = Some(data);
146    }
147
148    /// Writes the text to the given writer, replacing tagged IPs with their decorated versions.
149    ///
150    /// If a tag has a `decorated()` value, that value is written instead of the original
151    /// bytes in its `range()`. If no decoration is present, the original bytes are written.
152    ///
153    /// Tags MUST be sorted by their start position for this to work correctly.
154    #[inline]
155    pub fn write<W: Write>(&self, wtr: &mut W) -> io::Result<()> {
156        // Fast path for no tags
157        if self.tags.is_empty() {
158            return wtr.write_all(&self.text);
159        }
160
161        // If we have only one tag (common case), optimize for it
162        if self.tags.len() == 1 {
163            let tag = &self.tags[0];
164            if let Some(range) = tag.range() {
165                // Write the text before the tag
166                wtr.write_all(&self.text[..range.start])?;
167
168                // Write the decorated version if available, or the original IP
169                if let Some(decorated) = tag.decorated() {
170                    wtr.write_all(decorated.as_bytes())?;
171                } else {
172                    wtr.write_all(&self.text[range.clone()])?;
173                }
174
175                // Write the text after the tag
176                wtr.write_all(&self.text[range.end..])?;
177                return Ok(());
178            }
179        }
180
181        // If we have 2 tags (another common case), optimize for it
182        if self.tags.len() == 2 {
183            // Get the two tags
184            let mut tag1 = &self.tags[0];
185            let mut tag2 = &self.tags[1];
186
187            // Ensure tag1 comes before tag2
188            if let (Some(range1), Some(range2)) = (tag1.range(), tag2.range()) {
189                if range1.start > range2.start {
190                    std::mem::swap(&mut tag1, &mut tag2);
191                }
192
193                // Write in three parts: before tag1, tag1, between tags, tag2, after tag2
194                wtr.write_all(&self.text[..range1.start])?;
195
196                if let Some(decorated) = tag1.decorated() {
197                    wtr.write_all(decorated.as_bytes())?;
198                } else {
199                    wtr.write_all(&self.text[range1.clone()])?;
200                }
201
202                wtr.write_all(&self.text[range1.end..range2.start])?;
203
204                if let Some(decorated) = tag2.decorated() {
205                    wtr.write_all(decorated.as_bytes())?;
206                } else {
207                    wtr.write_all(&self.text[range2.clone()])?;
208                }
209
210                wtr.write_all(&self.text[range2.end..])?;
211                return Ok(());
212            }
213        }
214
215        // For multiple tags, process them in order
216        // Tags should always be sorted by position since the extractor finds matches left-to-right
217        #[cfg(debug_assertions)]
218        {
219            for i in 1..self.tags.len() {
220                if let (Some(prev), Some(curr)) = (self.tags[i - 1].range(), self.tags[i].range()) {
221                    debug_assert!(prev.start <= curr.start, "Tags must be sorted by position");
222                }
223            }
224        }
225
226        let mut last_end = 0;
227        for tag in &self.tags {
228            if let Some(range) = tag.range() {
229                // Write the text between the previous tag and this one
230                wtr.write_all(&self.text[last_end..range.start])?;
231
232                // Write the decorated version if available, or the original IP
233                if let Some(decorated) = tag.decorated() {
234                    wtr.write_all(decorated.as_bytes())?;
235                } else {
236                    wtr.write_all(&self.text[range.clone()])?;
237                }
238
239                last_end = range.end;
240            }
241        }
242
243        // Write any remaining text
244        if last_end < self.text.len() {
245            wtr.write_all(&self.text[last_end..])?;
246        }
247
248        Ok(())
249    }
250
251    /// Writes the `Tagged` object as a JSON object to the given writer.
252    ///
253    /// This is useful for exporting structured metadata about the IPs found in the text.
254    #[inline]
255    pub fn write_json<W: Write + ?Sized>(&mut self, wtr: &mut W) -> io::Result<()> {
256        // Set the text data for JSON serialization
257        if self.text_data.is_none() {
258            // Fast path for direct UTF-8 conversion
259            if self.text.is_empty() {
260                self.text_data = Some(TextData {
261                    text: String::new(),
262                });
263            } else if let Ok(s) = std::str::from_utf8(&self.text) {
264                // Direct conversion without allocation
265                self.text_data = Some(TextData {
266                    text: s.to_string(),
267                });
268            } else {
269                // Fallback for non-UTF8
270                self.text_data = Some(TextData {
271                    text: String::from_utf8_lossy(&self.text).to_string(),
272                });
273            }
274        }
275
276        // Serialize to JSON using the faster non-pretty writer
277        serde_json::to_writer(wtr, self)?;
278        Ok(())
279    }
280}