Skip to main content

ip_extract/
tag.rs

1use serde::Serialize;
2use std::io::{self, Write};
3use std::ops::Range;
4
5/// A tag representing an IP address found in text.
6#[derive(Clone, Debug, Serialize)]
7pub struct Tag {
8    /// The IP address text itself.
9    #[serde(rename = "value")]
10    ip: String,
11    /// The range in the original text where the IP was found.
12    #[serde(skip_serializing_if = "Option::is_none")]
13    range: Option<Range<usize>>,
14    /// The decorated IP with geolocation information.
15    #[serde(skip_serializing_if = "Option::is_none")]
16    decorated: Option<String>,
17}
18
19impl Tag {
20    /// Create a new tag for an IP address.
21    ///
22    /// The `ip` should be the literal text of the IP address as found in the input.
23    #[inline]
24    pub fn new<S: Into<String>>(ip: S) -> Tag {
25        Tag {
26            ip: ip.into(),
27            range: None,
28            decorated: None,
29        }
30    }
31
32    /// Set the byte range [start, end) where this tag was found in the original text.
33    #[inline]
34    #[must_use]
35    pub fn with_range(mut self, range: Range<usize>) -> Self {
36        self.range = Some(range);
37        self
38    }
39
40    /// Set a "decorated" version of the IP (e.g., with geolocation metadata).
41    ///
42    /// This string will be used instead of the original IP when calling `Tagged::write`.
43    #[inline]
44    pub fn with_decoration<S: Into<String>>(mut self, decorated: S) -> Self {
45        self.decorated = Some(decorated.into());
46        self
47    }
48
49    /// Get the IP address text.
50    #[inline]
51    #[must_use]
52    pub fn ip(&self) -> &str {
53        &self.ip
54    }
55
56    /// Get the range of this tag in the original text, if available.
57    #[inline]
58    #[must_use]
59    pub fn range(&self) -> Option<&Range<usize>> {
60        self.range.as_ref()
61    }
62
63    /// Get the decorated version of this IP, if available.
64    #[inline]
65    #[must_use]
66    pub fn decorated(&self) -> Option<&str> {
67        self.decorated.as_deref()
68    }
69}
70
71/// A line of text with tags.
72#[derive(Clone, Debug, Serialize)]
73pub struct Tagged {
74    /// The original text.
75    #[serde(skip_serializing)]
76    text: Vec<u8>,
77    /// The tags found in the text.
78    tags: Vec<Tag>,
79    /// The original text as a string (for JSON serialization).
80    #[serde(rename = "data")]
81    text_data: Option<TextData>,
82}
83
84/// Represents the text data for JSON serialization.
85#[derive(Clone, Debug, Serialize)]
86pub struct TextData {
87    /// The original text as a string.
88    pub text: String,
89}
90
91impl Tagged {
92    /// Create a new `Tagged` container for a slice of text.
93    ///
94    /// This container holds the original text and will collect any `Tag`s found within it.
95    #[inline]
96    #[must_use]
97    pub fn new(text: &[u8]) -> Tagged {
98        // Pre-allocate a reasonable capacity for tags based on text length
99        let capacity = if text.len() > 1000 { 16 } else { 4 };
100        Tagged {
101            text: text.to_vec(),
102            tags: Vec::with_capacity(capacity), // Most lines have few IPs
103            text_data: None,
104        }
105    }
106
107    /// Adds a tag to this text.
108    ///
109    /// The tag should contain a range that corresponds to its position in `self.text()`.
110    #[inline]
111    #[must_use]
112    pub fn tag(mut self, tag: Tag) -> Self {
113        self.tags.push(tag);
114        self
115    }
116
117    /// Get the tags in this text.
118    #[inline]
119    #[must_use]
120    pub fn tags(&self) -> &[Tag] {
121        &self.tags
122    }
123
124    /// Get the original text.
125    #[inline]
126    #[must_use]
127    pub fn text(&self) -> &[u8] {
128        &self.text
129    }
130
131    /// Explicitly sets the text data used for JSON serialization.
132    #[inline]
133    pub fn set_text_data(&mut self, data: TextData) {
134        self.text_data = Some(data);
135    }
136
137    /// Writes the text to the given writer, replacing tagged IPs with their decorated versions.
138    ///
139    /// If a tag has a `decorated()` value, that value is written instead of the original
140    /// bytes in its `range()`. If no decoration is present, the original bytes are written.
141    ///
142    /// Tags MUST be sorted by their start position for this to work correctly.
143    #[inline]
144    pub fn write<W: Write>(&self, wtr: &mut W) -> io::Result<()> {
145        // Fast path for no tags
146        if self.tags.is_empty() {
147            return wtr.write_all(&self.text);
148        }
149
150        // If we have only one tag (common case), optimize for it
151        if self.tags.len() == 1 {
152            let tag = &self.tags[0];
153            if let Some(range) = tag.range() {
154                // Write the text before the tag
155                wtr.write_all(&self.text[..range.start])?;
156
157                // Write the decorated version if available, or the original IP
158                if let Some(decorated) = tag.decorated() {
159                    wtr.write_all(decorated.as_bytes())?;
160                } else {
161                    wtr.write_all(&self.text[range.clone()])?;
162                }
163
164                // Write the text after the tag
165                wtr.write_all(&self.text[range.end..])?;
166                return Ok(());
167            }
168        }
169
170        // If we have 2 tags (another common case), optimize for it
171        if self.tags.len() == 2 {
172            // Get the two tags
173            let mut tag1 = &self.tags[0];
174            let mut tag2 = &self.tags[1];
175
176            // Ensure tag1 comes before tag2
177            if let (Some(range1), Some(range2)) = (tag1.range(), tag2.range()) {
178                if range1.start > range2.start {
179                    std::mem::swap(&mut tag1, &mut tag2);
180                }
181
182                // Write in three parts: before tag1, tag1, between tags, tag2, after tag2
183                wtr.write_all(&self.text[..range1.start])?;
184
185                if let Some(decorated) = tag1.decorated() {
186                    wtr.write_all(decorated.as_bytes())?;
187                } else {
188                    wtr.write_all(&self.text[range1.clone()])?;
189                }
190
191                wtr.write_all(&self.text[range1.end..range2.start])?;
192
193                if let Some(decorated) = tag2.decorated() {
194                    wtr.write_all(decorated.as_bytes())?;
195                } else {
196                    wtr.write_all(&self.text[range2.clone()])?;
197                }
198
199                wtr.write_all(&self.text[range2.end..])?;
200                return Ok(());
201            }
202        }
203
204        // For multiple tags, process them in order
205        // Tags should always be sorted by position since the extractor finds matches left-to-right
206        #[cfg(debug_assertions)]
207        {
208            for i in 1..self.tags.len() {
209                if let (Some(prev), Some(curr)) = (self.tags[i - 1].range(), self.tags[i].range()) {
210                    debug_assert!(prev.start <= curr.start, "Tags must be sorted by position");
211                }
212            }
213        }
214
215        let mut last_end = 0;
216        for tag in &self.tags {
217            if let Some(range) = tag.range() {
218                // Write the text between the previous tag and this one
219                wtr.write_all(&self.text[last_end..range.start])?;
220
221                // Write the decorated version if available, or the original IP
222                if let Some(decorated) = tag.decorated() {
223                    wtr.write_all(decorated.as_bytes())?;
224                } else {
225                    wtr.write_all(&self.text[range.clone()])?;
226                }
227
228                last_end = range.end;
229            }
230        }
231
232        // Write any remaining text
233        if last_end < self.text.len() {
234            wtr.write_all(&self.text[last_end..])?;
235        }
236
237        Ok(())
238    }
239
240    /// Writes the `Tagged` object as a JSON object to the given writer.
241    ///
242    /// This is useful for exporting structured metadata about the IPs found in the text.
243    #[inline]
244    pub fn write_json<W: Write + ?Sized>(&mut self, wtr: &mut W) -> io::Result<()> {
245        // Set the text data for JSON serialization
246        if self.text_data.is_none() {
247            // Fast path for direct UTF-8 conversion
248            if self.text.is_empty() {
249                self.text_data = Some(TextData {
250                    text: String::new(),
251                });
252            } else if let Ok(s) = std::str::from_utf8(&self.text) {
253                // Direct conversion without allocation
254                self.text_data = Some(TextData {
255                    text: s.to_string(),
256                });
257            } else {
258                // Fallback for non-UTF8
259                self.text_data = Some(TextData {
260                    text: String::from_utf8_lossy(&self.text).to_string(),
261                });
262            }
263        }
264
265        // Serialize to JSON using the faster non-pretty writer
266        serde_json::to_writer(wtr, self)?;
267        Ok(())
268    }
269}