ip_extract/tag.rs
1use serde::Serialize;
2use std::io::{self, Write};
3use std::ops::Range;
4
5/// A tag representing an IP address found in text.
6#[derive(Clone, Debug, Serialize)]
7pub struct Tag {
8 /// The IP address text itself.
9 #[serde(rename = "value")]
10 ip: String,
11 /// The range in the original text where the IP was found.
12 #[serde(skip_serializing_if = "Option::is_none")]
13 range: Option<Range<usize>>,
14 /// The decorated IP with geolocation information.
15 #[serde(skip_serializing_if = "Option::is_none")]
16 decorated: Option<String>,
17}
18
19impl Tag {
20 /// Create a new tag for an IP address.
21 ///
22 /// The `ip` should be the literal text of the IP address as found in the input.
23 #[inline]
24 pub fn new<S: Into<String>>(ip: S) -> Tag {
25 Tag {
26 ip: ip.into(),
27 range: None,
28 decorated: None,
29 }
30 }
31
32 /// Set the byte range [start, end) where this tag was found in the original text.
33 #[inline]
34 #[must_use]
35 pub fn with_range(mut self, range: Range<usize>) -> Self {
36 self.range = Some(range);
37 self
38 }
39
40 /// Set a "decorated" version of the IP (e.g., with geolocation metadata).
41 ///
42 /// This string will be used instead of the original IP when calling `Tagged::write`.
43 #[inline]
44 pub fn with_decoration<S: Into<String>>(mut self, decorated: S) -> Self {
45 self.decorated = Some(decorated.into());
46 self
47 }
48
49 /// Get the IP address text.
50 #[inline]
51 #[must_use]
52 pub fn ip(&self) -> &str {
53 &self.ip
54 }
55
56 /// Get the range of this tag in the original text, if available.
57 #[inline]
58 #[must_use]
59 pub fn range(&self) -> Option<&Range<usize>> {
60 self.range.as_ref()
61 }
62
63 /// Get the decorated version of this IP, if available.
64 #[inline]
65 #[must_use]
66 pub fn decorated(&self) -> Option<&str> {
67 self.decorated.as_deref()
68 }
69}
70
71/// A line of text with tags.
72#[derive(Clone, Debug, Serialize)]
73pub struct Tagged {
74 /// The original text.
75 #[serde(skip_serializing)]
76 text: Vec<u8>,
77 /// The tags found in the text.
78 tags: Vec<Tag>,
79 /// The original text as a string (for JSON serialization).
80 #[serde(rename = "data")]
81 text_data: Option<TextData>,
82}
83
84/// Represents the text data for JSON serialization.
85#[derive(Clone, Debug, Serialize)]
86pub struct TextData {
87 /// The original text as a string.
88 pub text: String,
89}
90
91impl Tagged {
92 /// Create a new `Tagged` container for a slice of text.
93 ///
94 /// This container holds the original text and will collect any `Tag`s found within it.
95 #[inline]
96 #[must_use]
97 pub fn new(text: &[u8]) -> Tagged {
98 // Pre-allocate a reasonable capacity for tags based on text length
99 let capacity = if text.len() > 1000 { 16 } else { 4 };
100 Tagged {
101 text: text.to_vec(),
102 tags: Vec::with_capacity(capacity), // Most lines have few IPs
103 text_data: None,
104 }
105 }
106
107 /// Adds a tag to this text.
108 ///
109 /// The tag should contain a range that corresponds to its position in `self.text()`.
110 #[inline]
111 #[must_use]
112 pub fn tag(mut self, tag: Tag) -> Self {
113 self.tags.push(tag);
114 self
115 }
116
117 /// Get the tags in this text.
118 #[inline]
119 #[must_use]
120 pub fn tags(&self) -> &[Tag] {
121 &self.tags
122 }
123
124 /// Get the original text.
125 #[inline]
126 #[must_use]
127 pub fn text(&self) -> &[u8] {
128 &self.text
129 }
130
131 /// Explicitly sets the text data used for JSON serialization.
132 #[inline]
133 pub fn set_text_data(&mut self, data: TextData) {
134 self.text_data = Some(data);
135 }
136
137 /// Writes the text to the given writer, replacing tagged IPs with their decorated versions.
138 ///
139 /// If a tag has a `decorated()` value, that value is written instead of the original
140 /// bytes in its `range()`. If no decoration is present, the original bytes are written.
141 ///
142 /// Tags MUST be sorted by their start position for this to work correctly.
143 #[inline]
144 pub fn write<W: Write>(&self, wtr: &mut W) -> io::Result<()> {
145 // Fast path for no tags
146 if self.tags.is_empty() {
147 return wtr.write_all(&self.text);
148 }
149
150 // If we have only one tag (common case), optimize for it
151 if self.tags.len() == 1 {
152 let tag = &self.tags[0];
153 if let Some(range) = tag.range() {
154 // Write the text before the tag
155 wtr.write_all(&self.text[..range.start])?;
156
157 // Write the decorated version if available, or the original IP
158 if let Some(decorated) = tag.decorated() {
159 wtr.write_all(decorated.as_bytes())?;
160 } else {
161 wtr.write_all(&self.text[range.clone()])?;
162 }
163
164 // Write the text after the tag
165 wtr.write_all(&self.text[range.end..])?;
166 return Ok(());
167 }
168 }
169
170 // If we have 2 tags (another common case), optimize for it
171 if self.tags.len() == 2 {
172 // Get the two tags
173 let mut tag1 = &self.tags[0];
174 let mut tag2 = &self.tags[1];
175
176 // Ensure tag1 comes before tag2
177 if let (Some(range1), Some(range2)) = (tag1.range(), tag2.range()) {
178 if range1.start > range2.start {
179 std::mem::swap(&mut tag1, &mut tag2);
180 }
181
182 // Write in three parts: before tag1, tag1, between tags, tag2, after tag2
183 wtr.write_all(&self.text[..range1.start])?;
184
185 if let Some(decorated) = tag1.decorated() {
186 wtr.write_all(decorated.as_bytes())?;
187 } else {
188 wtr.write_all(&self.text[range1.clone()])?;
189 }
190
191 wtr.write_all(&self.text[range1.end..range2.start])?;
192
193 if let Some(decorated) = tag2.decorated() {
194 wtr.write_all(decorated.as_bytes())?;
195 } else {
196 wtr.write_all(&self.text[range2.clone()])?;
197 }
198
199 wtr.write_all(&self.text[range2.end..])?;
200 return Ok(());
201 }
202 }
203
204 // For multiple tags, process them in order
205 // Tags should always be sorted by position since the extractor finds matches left-to-right
206 #[cfg(debug_assertions)]
207 {
208 for i in 1..self.tags.len() {
209 if let (Some(prev), Some(curr)) = (self.tags[i - 1].range(), self.tags[i].range()) {
210 debug_assert!(prev.start <= curr.start, "Tags must be sorted by position");
211 }
212 }
213 }
214
215 let mut last_end = 0;
216 for tag in &self.tags {
217 if let Some(range) = tag.range() {
218 // Write the text between the previous tag and this one
219 wtr.write_all(&self.text[last_end..range.start])?;
220
221 // Write the decorated version if available, or the original IP
222 if let Some(decorated) = tag.decorated() {
223 wtr.write_all(decorated.as_bytes())?;
224 } else {
225 wtr.write_all(&self.text[range.clone()])?;
226 }
227
228 last_end = range.end;
229 }
230 }
231
232 // Write any remaining text
233 if last_end < self.text.len() {
234 wtr.write_all(&self.text[last_end..])?;
235 }
236
237 Ok(())
238 }
239
240 /// Writes the `Tagged` object as a JSON object to the given writer.
241 ///
242 /// This is useful for exporting structured metadata about the IPs found in the text.
243 #[inline]
244 pub fn write_json<W: Write + ?Sized>(&mut self, wtr: &mut W) -> io::Result<()> {
245 // Set the text data for JSON serialization
246 if self.text_data.is_none() {
247 // Fast path for direct UTF-8 conversion
248 if self.text.is_empty() {
249 self.text_data = Some(TextData {
250 text: String::new(),
251 });
252 } else if let Ok(s) = std::str::from_utf8(&self.text) {
253 // Direct conversion without allocation
254 self.text_data = Some(TextData {
255 text: s.to_string(),
256 });
257 } else {
258 // Fallback for non-UTF8
259 self.text_data = Some(TextData {
260 text: String::from_utf8_lossy(&self.text).to_string(),
261 });
262 }
263 }
264
265 // Serialize to JSON using the faster non-pretty writer
266 serde_json::to_writer(wtr, self)?;
267 Ok(())
268 }
269}