1use crate::{
8 field_range::{FieldRange, RegexOrString},
9 line_parser::LineParser,
10 mmap::MmapChoice,
11 single_byte_delim_parser::SingleByteDelimParser,
12};
13use anyhow::Result;
14use bstr::ByteSlice;
15use flate2::read::MultiGzDecoder;
16use grep_cli::DecompressionReaderBuilder;
17use regex::bytes::Regex;
18use ripline::{
19 line_buffer::{LineBuffer, LineBufferReader},
20 lines::{self, LineIter},
21 LineTerminator,
22};
23use std::{
24 fs::File,
25 io::{self, BufRead, BufReader, Read, Write},
26 path::Path,
27};
28
/// Delimiter used for both input and output when none is configured: a single tab byte.
const DEFAULT_DELIM: &[u8] = b"\t";
30
/// The source of the data to process.
pub enum HckInput<P: AsRef<Path>> {
    /// Read from the process's standard input.
    Stdin,
    /// Read from the file at the given path.
    Path(P),
}
36
37#[derive(Debug, Clone)]
39pub struct CoreConfig<'a> {
40 delimiter: &'a [u8],
41 output_delimiter: &'a [u8],
42 line_terminator: LineTerminator,
43 mmap_choice: MmapChoice,
44 is_parser_regex: bool,
45 try_decompress: bool,
46 raw_fields: Option<&'a str>,
47 raw_header_fields: Option<&'a [Regex]>,
48 raw_exclude: Option<&'a str>,
49 raw_exclude_headers: Option<&'a [Regex]>,
50 header_is_regex: bool,
51 parsed_delim: RegexOrString,
52}
53
54impl Default for CoreConfig<'_> {
55 fn default() -> Self {
56 Self {
57 delimiter: DEFAULT_DELIM,
58 output_delimiter: DEFAULT_DELIM,
59 line_terminator: LineTerminator::default(),
60 mmap_choice: unsafe { MmapChoice::auto() },
61 is_parser_regex: false,
62 try_decompress: false,
63 raw_fields: Some("1-"),
64 raw_header_fields: None,
65 raw_exclude: None,
66 raw_exclude_headers: None,
67 header_is_regex: false,
68 parsed_delim: RegexOrString::String(
69 std::str::from_utf8(DEFAULT_DELIM).unwrap().to_string(),
70 ),
71 }
72 }
73}
74
75impl CoreConfig<'_> {
76 pub fn parsed_delim(&self) -> &RegexOrString {
78 &self.parsed_delim
79 }
80
81 pub fn peek_first_line<P: AsRef<Path>>(
86 &self,
87 input: &HckInput<P>,
88 ) -> Result<Vec<u8>, io::Error> {
89 let mut buffer = String::new();
90 match input {
91 HckInput::Stdin => {
92 if self.try_decompress {
94 unimplemented!("Header selections not supported when piping gzipped stdin")
95 }
96 io::stdin().read_line(&mut buffer)?;
97 }
98
99 HckInput::Path(path) => {
100 if self.try_decompress {
101 let reader: Box<dyn Read> = if path
102 .as_ref()
103 .to_str()
104 .map(|p| p.ends_with(".gz"))
105 .unwrap_or(false)
106 {
107 Box::new(MultiGzDecoder::new(File::open(path)?))
108 } else {
109 Box::new(
110 DecompressionReaderBuilder::new()
111 .build(path)?,
113 )
114 };
115 let mut reader = BufReader::new(reader);
116 reader.read_line(&mut buffer)?;
117 } else {
118 BufReader::new(File::open(path)?).read_line(&mut buffer)?;
119 }
120 }
121 }
122 Ok(lines::without_terminator(buffer.as_bytes(), self.line_terminator).to_owned())
123 }
124
125 pub fn parse_fields<P>(&self, input: &HckInput<P>) -> Result<(Option<Vec<u8>>, Vec<FieldRange>)>
127 where
128 P: AsRef<Path>,
129 {
130 let (mut extra, fields) = match (self.raw_fields, self.raw_header_fields) {
132 (Some(field_list), Some(header_fields)) => {
133 let first_line = self.peek_first_line(input)?;
134 let mut fields = FieldRange::from_list(field_list)?;
135 let header_fields = FieldRange::from_header_list(
136 header_fields,
137 first_line.as_bytes(),
138 &self.parsed_delim,
139 self.header_is_regex,
140 false,
141 )?;
142 fields.extend(header_fields);
143 FieldRange::post_process_ranges(&mut fields);
144 (Some(first_line), fields)
145 }
146 (Some(field_list), None) => (None, FieldRange::from_list(field_list)?),
147 (None, Some(header_fields)) => {
148 let first_line = self.peek_first_line(input)?;
149 let fields = FieldRange::from_header_list(
150 header_fields,
151 first_line.as_bytes(),
152 &self.parsed_delim,
153 self.header_is_regex,
154 false,
155 )?;
156 (Some(first_line), fields)
157 }
158 (None, None) => (None, FieldRange::from_list("1-")?),
159 };
160
161 let fields = match (&self.raw_exclude, &self.raw_exclude_headers) {
162 (Some(exclude), Some(exclude_header)) => {
163 let exclude = FieldRange::from_list(exclude)?;
164 let fields = FieldRange::exclude(fields, exclude);
165 let first_line = if let Some(first_line) = extra {
166 first_line
167 } else {
168 self.peek_first_line(input)?
169 };
170 let exclude_headers = FieldRange::from_header_list(
171 exclude_header,
172 first_line.as_bytes(),
173 &self.parsed_delim,
174 self.header_is_regex,
175 true,
176 )?;
177 extra = Some(first_line);
178 FieldRange::exclude(fields, exclude_headers)
179 }
180 (Some(exclude), None) => {
181 let exclude = FieldRange::from_list(exclude)?;
182 FieldRange::exclude(fields, exclude)
183 }
184 (None, Some(exclude_header)) => {
185 let first_line = if let Some(first_line) = extra {
186 first_line
187 } else {
188 self.peek_first_line(input)?
189 };
190 let exclude_headers = FieldRange::from_header_list(
191 exclude_header,
192 first_line.as_bytes(),
193 &self.parsed_delim,
194 self.header_is_regex,
195 true,
196 )?;
197 extra = Some(first_line);
198 FieldRange::exclude(fields, exclude_headers)
199 }
200 (None, None) => fields,
201 };
202 Ok((extra, fields))
203 }
204}
205
206#[derive(Clone, Debug)]
208pub struct CoreConfigBuilder<'a> {
209 config: CoreConfig<'a>,
210}
211
212impl<'a> CoreConfigBuilder<'a> {
213 pub fn new() -> Self {
214 Self {
215 config: CoreConfig::default(),
216 }
217 }
218
219 pub fn build(mut self) -> Result<CoreConfig<'a>> {
220 let delim = if self.config.is_parser_regex {
221 RegexOrString::Regex(Regex::new(self.config.delimiter.to_str()?)?)
222 } else {
223 let unescaped =
224 std::str::from_utf8(&grep_cli::unescape(self.config.delimiter.to_str()?))?
225 .to_string();
226 RegexOrString::String(unescaped)
227 };
228 self.config.parsed_delim = delim;
229 Ok(self.config)
230 }
231
232 pub fn delimiter(mut self, delim: &'a [u8]) -> Self {
234 self.config.delimiter = delim;
235 self
236 }
237
238 pub fn output_delimiter(mut self, delim: &'a [u8]) -> Self {
240 self.config.output_delimiter = delim;
241 self
242 }
243
244 pub fn line_terminator(mut self, term: LineTerminator) -> Self {
246 self.config.line_terminator = term;
247 self
248 }
249
250 pub fn mmap(mut self, mmap_choice: MmapChoice) -> Self {
252 self.config.mmap_choice = mmap_choice;
253 self
254 }
255
256 #[allow(clippy::wrong_self_convention)]
258 pub fn is_regex_parser(mut self, is_regex: bool) -> Self {
259 self.config.is_parser_regex = is_regex;
260 self
261 }
262
263 pub fn try_decompress(mut self, try_decompress: bool) -> Self {
265 self.config.try_decompress = try_decompress;
266 self
267 }
268
269 pub fn fields(mut self, fields: Option<&'a str>) -> Self {
271 self.config.raw_fields = fields;
272 self
273 }
274
275 pub fn headers(mut self, headers: Option<&'a [Regex]>) -> Self {
277 self.config.raw_header_fields = headers;
278 self
279 }
280
281 pub fn exclude(mut self, exclude: Option<&'a str>) -> Self {
283 self.config.raw_exclude = exclude;
284 self
285 }
286
287 pub fn exclude_headers(mut self, exclude_headers: Option<&'a [Regex]>) -> Self {
289 self.config.raw_exclude_headers = exclude_headers;
290 self
291 }
292
293 pub fn header_is_regex(mut self, header_is_regex: bool) -> Self {
295 self.config.header_is_regex = header_is_regex;
296 self
297 }
298}
299
300impl Default for CoreConfigBuilder<'_> {
301 fn default() -> Self {
302 Self::new()
303 }
304}
305
306pub struct Core<'a, L> {
308 config: &'a CoreConfig<'a>,
310 fields: &'a [FieldRange],
312 line_parser: L,
314 line_buffer: &'a mut LineBuffer,
316}
317
318impl<'a, L> Core<'a, L>
319where
320 L: LineParser<'a>,
321{
322 pub fn new(
324 config: &'a CoreConfig,
325 fields: &'a [FieldRange],
326 line_parser: L,
327 line_buffer: &'a mut LineBuffer,
328 ) -> Self {
329 Self {
330 config,
331 fields,
332 line_parser,
333 line_buffer,
334 }
335 }
336
337 #[inline]
339 fn are_fields_pos_sorted(&self) -> bool {
340 let mut test = 0;
341 for field in self.fields {
342 if field.pos < test {
343 return false;
344 }
345 test = field.pos
346 }
347 true
348 }
349
350 fn allow_fastmode(&self) -> bool {
354 self.config.delimiter.len() == 1
355 && self.config.line_terminator.as_bytes().len() == 1
356 && !self.config.is_parser_regex
357 && self.are_fields_pos_sorted()
358 }
359
360 pub fn hck_input<P, W>(
361 &mut self,
362 input: HckInput<P>,
363 mut output: W,
364 header: Option<Vec<u8>>,
365 ) -> Result<(), io::Error>
366 where
367 P: AsRef<Path>,
368 W: Write,
369 {
370 match input {
372 HckInput::Stdin => {
373 if let Some(header) = header {
374 self.hck_bytes(header.as_bytes(), &mut output)?;
375 }
376 let reader: Box<dyn Read> = if self.config.try_decompress {
377 Box::new(MultiGzDecoder::new(io::stdin()))
378 } else {
379 Box::new(io::stdin())
380 };
381 if self.allow_fastmode() {
382 self.hck_reader_fast(reader, &mut output)
383 } else {
384 self.hck_reader(reader, &mut output)
385 }
386 }
387 HckInput::Path(path) => {
388 if self.config.try_decompress {
389 let reader: Box<dyn Read> = if path
390 .as_ref()
391 .to_str()
392 .map(|p| p.ends_with(".gz"))
393 .unwrap_or(false)
394 {
395 Box::new(MultiGzDecoder::new(File::open(&path)?))
396 } else {
397 Box::new(
398 DecompressionReaderBuilder::new()
399 .build(&path)?,
401 )
402 };
403 if self.allow_fastmode() {
404 self.hck_reader_fast(reader, &mut output)
405 } else {
406 self.hck_reader(reader, &mut output)
407 }
408 } else {
409 let file = File::open(&path)?;
410 if let Some(mmap) = self.config.mmap_choice.open(&file, Some(&path)) {
411 if self.allow_fastmode() {
412 self.hck_bytes_fast(mmap.as_bytes(), &mut output)
413 } else {
414 self.hck_bytes(mmap.as_bytes(), &mut output)
415 }
416 } else if self.allow_fastmode() {
417 self.hck_reader_fast(file, &mut output)
418 } else {
419 self.hck_reader(file, &mut output)
420 }
421 }
422 }
423 }
424 }
425
426 #[allow(clippy::missing_transmute_annotations)]
430 pub fn hck_bytes<W>(&mut self, bytes: &[u8], mut output: W) -> Result<(), io::Error>
431 where
432 W: Write,
433 {
434 let iter = LineIter::new(self.config.line_terminator.as_byte(), bytes.as_bytes());
435 let mut shuffler: Vec<Vec<&'static [u8]>> =
436 vec![vec![]; self.fields.iter().map(|f| f.pos).max().unwrap() + 1];
437 for line in iter {
438 let mut s: Vec<Vec<&[u8]>> = shuffler;
439 self.line_parser.parse_line(
440 lines::without_terminator(line, self.config.line_terminator),
441 &mut s,
442 );
443 let items = s.iter_mut().flat_map(|s| s.drain(..));
444 output.join_append(
445 self.config.output_delimiter,
446 items,
447 &self.config.line_terminator,
448 )?;
449 shuffler = unsafe { core::mem::transmute(s) };
450 }
451 Ok(())
452 }
453
454 pub fn hck_bytes_fast<W: Write>(&mut self, bytes: &[u8], output: W) -> Result<(), io::Error> {
462 let mut buffer_parser = SingleByteDelimParser::new(
463 self.config.line_terminator,
464 self.config.output_delimiter,
465 self.fields,
466 self.config.delimiter[0],
467 );
468 buffer_parser.process_buffer(bytes, output)?;
469 Ok(())
470 }
471
472 pub fn hck_reader_fast<R: Read, W: Write>(
480 &mut self,
481 reader: R,
482 mut output: W,
483 ) -> Result<(), io::Error> {
484 let mut reader = LineBufferReader::new(reader, self.line_buffer);
485 let mut buffer_parser = SingleByteDelimParser::new(
486 self.config.line_terminator,
487 self.config.output_delimiter,
488 self.fields,
489 self.config.delimiter[0],
490 );
491
492 while reader.fill()? {
493 buffer_parser.process_buffer(reader.buffer(), &mut output)?;
494 buffer_parser.reset();
495 reader.consume(reader.buffer().len());
496 }
497 Ok(())
498 }
499
500 #[allow(clippy::missing_transmute_annotations)]
502 pub fn hck_reader<R: Read, W: Write>(
503 &mut self,
504 reader: R,
505 mut output: W,
506 ) -> Result<(), io::Error> {
507 let mut reader = LineBufferReader::new(reader, self.line_buffer);
508 let mut shuffler: Vec<Vec<&'static [u8]>> =
509 vec![vec![]; self.fields.iter().map(|f| f.pos).max().unwrap() + 1];
510 while reader.fill()? {
511 let iter = LineIter::new(self.config.line_terminator.as_byte(), reader.buffer());
512
513 for line in iter {
514 let mut s: Vec<Vec<&[u8]>> = shuffler;
515 self.line_parser.parse_line(
516 lines::without_terminator(line, self.config.line_terminator),
517 &mut s,
518 );
519
520 let items = s.iter_mut().flat_map(|s| s.drain(..));
521 output.join_append(
522 self.config.output_delimiter,
523 items,
524 &self.config.line_terminator,
525 )?;
526 shuffler = unsafe { core::mem::transmute(s) };
527 }
528 reader.consume(reader.buffer().len());
529 }
530 Ok(())
531 }
532}
533
534pub trait JoinAppend {
536 fn join_append<'b>(
538 &mut self,
539 sep: &[u8],
540 items: impl Iterator<Item = &'b [u8]>,
541 term: &LineTerminator,
542 ) -> Result<(), io::Error>;
543}
544
545impl<W: Write> JoinAppend for W {
547 #[inline(always)]
549 fn join_append<'b>(
550 &mut self,
551 sep: &[u8],
552 mut items: impl Iterator<Item = &'b [u8]>,
553 term: &LineTerminator,
554 ) -> Result<(), io::Error> {
555 if let Some(item) = items.next() {
556 self.write_all(item)?;
557 }
558
559 for item in items {
560 self.write_all(sep)?;
561 self.write_all(item)?;
562 }
563 self.write_all(term.as_bytes())?;
564 Ok(())
565 }
566}