use crate::{
field_range::{FieldRange, RegexOrString},
line_parser::LineParser,
mmap::MmapChoice,
single_byte_delim_parser::SingleByteDelimParser,
};
use anyhow::Result;
use bstr::ByteSlice;
use flate2::read::MultiGzDecoder;
use grep_cli::DecompressionReaderBuilder;
use regex::bytes::Regex;
use ripline::{
line_buffer::{LineBuffer, LineBufferReader},
lines::{self, LineIter},
LineTerminator,
};
use std::{
fs::File,
io::{self, BufRead, BufReader, Read, Write},
path::Path,
};
/// Default delimiter: a single tab byte. `CoreConfig::default` uses it for both the input
/// delimiter and the output delimiter.
const DEFAULT_DELIM: &[u8] = &[b'\t'];
/// The source that records are read from.
///
/// `Debug` and `Clone` are derived (conditionally on `P`) so the value can be logged and
/// embedded in other derived types.
#[derive(Debug, Clone)]
pub enum HckInput<P: AsRef<Path>> {
    /// Read from the process's standard input.
    Stdin,
    /// Read from the file at the given path.
    Path(P),
}
/// Settings that control how input is opened, split into fields, and written back out.
///
/// Values are assembled through `CoreConfigBuilder`; `CoreConfigBuilder::build` fills in
/// `parsed_delim` from `delimiter` and `is_parser_regex`.
#[derive(Debug, Clone)]
pub struct CoreConfig<'a> {
// Raw input delimiter bytes as supplied by the caller (regex source or escaped literal).
delimiter: &'a [u8],
// Bytes written between fields on output.
output_delimiter: &'a [u8],
// Terminator used to split input into lines and appended to every output line.
line_terminator: LineTerminator,
// Consulted when opening a path input without decompression to decide on memory mapping.
mmap_choice: MmapChoice,
// When true, `delimiter` is compiled as a regex by the builder; also disables the fast path.
is_parser_regex: bool,
// When true, inputs are routed through a decompressing reader.
try_decompress: bool,
// Field list handed to `FieldRange::from_list` (defaults to "1-").
raw_fields: Option<&'a str>,
// Patterns resolved against the first input line via `FieldRange::from_header_list`.
raw_header_fields: Option<&'a [Regex]>,
// Field list of columns to remove, parsed with `FieldRange::from_list`.
raw_exclude: Option<&'a str>,
// Patterns for columns to remove, resolved against the first input line.
raw_exclude_headers: Option<&'a [Regex]>,
// Forwarded unchanged to `FieldRange::from_header_list`.
// NOTE(review): presumably selects regex vs. literal header matching — confirm in `field_range`.
header_is_regex: bool,
// Delimiter in parsed form (compiled regex or unescaped string); set by the builder.
parsed_delim: RegexOrString,
}
impl<'a> Default for CoreConfig<'a> {
    /// Builds the baseline configuration: tab-delimited input and output, all fields selected
    /// (`"1-"`), no exclusions, no regex parsing, and no decompression.
    fn default() -> Self {
        // `DEFAULT_DELIM` is a single ASCII tab byte, so UTF-8 validation cannot fail here.
        let default_delim = std::str::from_utf8(DEFAULT_DELIM).unwrap().to_string();

        // NOTE(review): `MmapChoice::auto` is an `unsafe fn`; its safety contract is defined in
        // `crate::mmap` and is not visible from this file — confirm it is upheld by callers.
        let mmap_choice = unsafe { MmapChoice::auto() };

        Self {
            // Delimiters and line handling.
            delimiter: DEFAULT_DELIM,
            output_delimiter: DEFAULT_DELIM,
            parsed_delim: RegexOrString::String(default_delim),
            line_terminator: LineTerminator::default(),
            is_parser_regex: false,
            // Input handling.
            mmap_choice,
            try_decompress: false,
            // Field selection and exclusion.
            raw_fields: Some("1-"),
            raw_header_fields: None,
            raw_exclude: None,
            raw_exclude_headers: None,
            header_is_regex: false,
        }
    }
}
impl<'a> CoreConfig<'a> {
pub fn parsed_delim(&self) -> &RegexOrString {
&self.parsed_delim
}
pub fn peek_first_line<P: AsRef<Path>>(
&self,
input: &HckInput<P>,
) -> Result<Vec<u8>, io::Error> {
let mut buffer = String::new();
match input {
HckInput::Stdin => {
if self.try_decompress {
unimplemented!("Header selections not supported when piping gzipped stdin")
}
io::stdin().read_line(&mut buffer)?;
}
HckInput::Path(path) => {
if self.try_decompress {
let reader: Box<dyn Read> = if path
.as_ref()
.to_str()
.map(|p| p.ends_with(".gz"))
.unwrap_or(false)
{
Box::new(MultiGzDecoder::new(File::open(path)?))
} else {
Box::new(
DecompressionReaderBuilder::new()
.build(path)?,
)
};
let mut reader = BufReader::new(reader);
reader.read_line(&mut buffer)?;
} else {
BufReader::new(File::open(path)?).read_line(&mut buffer)?;
}
}
}
Ok(lines::without_terminator(buffer.as_bytes(), self.line_terminator).to_owned())
}
pub fn parse_fields<P>(&self, input: &HckInput<P>) -> Result<(Option<Vec<u8>>, Vec<FieldRange>)>
where
P: AsRef<Path>,
{
let (mut extra, fields) = match (self.raw_fields, self.raw_header_fields) {
(Some(field_list), Some(header_fields)) => {
let first_line = self.peek_first_line(input)?;
let mut fields = FieldRange::from_list(field_list)?;
let header_fields = FieldRange::from_header_list(
header_fields,
first_line.as_bytes(),
&self.parsed_delim,
self.header_is_regex,
false,
)?;
fields.extend(header_fields.into_iter());
FieldRange::post_process_ranges(&mut fields);
(Some(first_line), fields)
}
(Some(field_list), None) => (None, FieldRange::from_list(field_list)?),
(None, Some(header_fields)) => {
let first_line = self.peek_first_line(input)?;
let fields = FieldRange::from_header_list(
header_fields,
first_line.as_bytes(),
&self.parsed_delim,
self.header_is_regex,
false,
)?;
(Some(first_line), fields)
}
(None, None) => (None, FieldRange::from_list("1-")?),
};
let fields = match (&self.raw_exclude, &self.raw_exclude_headers) {
(Some(exclude), Some(exclude_header)) => {
let exclude = FieldRange::from_list(exclude)?;
let fields = FieldRange::exclude(fields, exclude);
let first_line = if let Some(first_line) = extra {
first_line
} else {
self.peek_first_line(input)?
};
let exclude_headers = FieldRange::from_header_list(
exclude_header,
first_line.as_bytes(),
&self.parsed_delim,
self.header_is_regex,
true,
)?;
extra = Some(first_line);
FieldRange::exclude(fields, exclude_headers)
}
(Some(exclude), None) => {
let exclude = FieldRange::from_list(exclude)?;
FieldRange::exclude(fields, exclude)
}
(None, Some(exclude_header)) => {
let first_line = if let Some(first_line) = extra {
first_line
} else {
self.peek_first_line(input)?
};
let exclude_headers = FieldRange::from_header_list(
exclude_header,
first_line.as_bytes(),
&self.parsed_delim,
self.header_is_regex,
true,
)?;
extra = Some(first_line);
FieldRange::exclude(fields, exclude_headers)
}
(None, None) => fields,
};
Ok((extra, fields))
}
}
/// Builder for [`CoreConfig`]: starts from `CoreConfig::default()` and is finalized by `build`.
#[derive(Clone, Debug)]
pub struct CoreConfigBuilder<'a> {
// The configuration being assembled; each setter overwrites one of its fields.
config: CoreConfig<'a>,
}
impl<'a> CoreConfigBuilder<'a> {
pub fn new() -> Self {
Self {
config: CoreConfig::default(),
}
}
pub fn build(mut self) -> Result<CoreConfig<'a>> {
let delim = if self.config.is_parser_regex {
RegexOrString::Regex(Regex::new(self.config.delimiter.to_str()?)?)
} else {
let unescaped =
std::str::from_utf8(&grep_cli::unescape(self.config.delimiter.to_str()?))?
.to_string();
RegexOrString::String(unescaped)
};
self.config.parsed_delim = delim;
Ok(self.config)
}
pub fn delimiter(mut self, delim: &'a [u8]) -> Self {
self.config.delimiter = delim;
self
}
pub fn output_delimiter(mut self, delim: &'a [u8]) -> Self {
self.config.output_delimiter = delim;
self
}
pub fn line_terminator(mut self, term: LineTerminator) -> Self {
self.config.line_terminator = term;
self
}
pub fn mmap(mut self, mmap_choice: MmapChoice) -> Self {
self.config.mmap_choice = mmap_choice;
self
}
#[allow(clippy::wrong_self_convention)]
pub fn is_regex_parser(mut self, is_regex: bool) -> Self {
self.config.is_parser_regex = is_regex;
self
}
pub fn try_decompress(mut self, try_decompress: bool) -> Self {
self.config.try_decompress = try_decompress;
self
}
pub fn fields(mut self, fields: Option<&'a str>) -> Self {
self.config.raw_fields = fields;
self
}
pub fn headers(mut self, headers: Option<&'a [Regex]>) -> Self {
self.config.raw_header_fields = headers;
self
}
pub fn exclude(mut self, exclude: Option<&'a str>) -> Self {
self.config.raw_exclude = exclude;
self
}
pub fn exclude_headers(mut self, exclude_headers: Option<&'a [Regex]>) -> Self {
self.config.raw_exclude_headers = exclude_headers;
self
}
pub fn header_is_regex(mut self, header_is_regex: bool) -> Self {
self.config.header_is_regex = header_is_regex;
self
}
}
impl<'a> Default for CoreConfigBuilder<'a> {
fn default() -> Self {
Self::new()
}
}
/// Drives reading an input, splitting each line with the line parser `L`, and writing the
/// selected fields to an output.
pub struct Core<'a, L> {
// Settings controlling delimiters, line termination, mmap and decompression.
config: &'a CoreConfig<'a>,
// Field ranges to emit; each range's `pos` is used to size and order the output slots.
fields: &'a [FieldRange],
// Parser used on the non-fast path to split a line into the output slots.
line_parser: L,
// Buffer handed to `LineBufferReader` for reader-based inputs.
line_buffer: &'a mut LineBuffer,
}
impl<'a, L> Core<'a, L>
where
    L: LineParser<'a>,
{
    /// Creates a `Core` from a configuration, the resolved field ranges, a line parser and a
    /// reusable line buffer.
    pub fn new(
        config: &'a CoreConfig,
        fields: &'a [FieldRange],
        line_parser: L,
        line_buffer: &'a mut LineBuffer,
    ) -> Self {
        Self {
            config,
            fields,
            line_parser,
            line_buffer,
        }
    }

    /// Returns `true` when the fields' `pos` values are in non-decreasing order.
    #[inline]
    fn are_fields_pos_sorted(&self) -> bool {
        self.fields
            .windows(2)
            .all(|pair| pair[0].pos <= pair[1].pos)
    }

    /// Returns `true` when the single-byte-delimiter fast path may be used: a one-byte
    /// delimiter, a one-byte line terminator, no regex parsing, and position-sorted fields.
    fn allow_fastmode(&self) -> bool {
        self.config.delimiter.len() == 1
            && self.config.line_terminator.as_bytes().len() == 1
            && !self.config.is_parser_regex
            && self.are_fields_pos_sorted()
    }

    /// Number of output slots needed by the slow path: one past the largest field `pos`, or
    /// zero when there are no fields (instead of panicking on an empty field list).
    fn shuffler_len(&self) -> usize {
        self.fields
            .iter()
            .map(|f| f.pos)
            .max()
            .map_or(0, |max_pos| max_pos + 1)
    }

    /// Opens `path` through a decompressing reader: `.gz` paths are decoded in-process, all
    /// other paths go through the grep_cli decompression reader.
    fn open_decompressed(path: &Path) -> Result<Box<dyn Read>, io::Error> {
        let is_gzip = path
            .to_str()
            .map(|p| p.ends_with(".gz"))
            .unwrap_or(false);
        let reader: Box<dyn Read> = if is_gzip {
            Box::new(MultiGzDecoder::new(File::open(path)?))
        } else {
            Box::new(DecompressionReaderBuilder::new().build(path)?)
        };
        Ok(reader)
    }

    /// Processes `input` and writes the selected fields of every line to `output`.
    ///
    /// `header` is only used for `HckInput::Stdin`: it is the line already consumed from stdin
    /// while resolving header selections, so it is processed before the rest of the stream.
    /// For `HckInput::Path` the file is opened afresh here and `header` is ignored.
    ///
    /// # Errors
    ///
    /// Returns any I/O error raised while opening or reading the input or writing the output.
    pub fn hck_input<P, W>(
        &mut self,
        input: HckInput<P>,
        mut output: W,
        header: Option<Vec<u8>>,
    ) -> Result<(), io::Error>
    where
        P: AsRef<Path>,
        W: Write,
    {
        let fast = self.allow_fastmode();
        match input {
            HckInput::Stdin => {
                if let Some(header) = header {
                    self.hck_bytes(header.as_bytes(), &mut output)?;
                }
                let reader: Box<dyn Read> = if self.config.try_decompress {
                    Box::new(MultiGzDecoder::new(io::stdin()))
                } else {
                    Box::new(io::stdin())
                };
                if fast {
                    self.hck_reader_fast(reader, &mut output)
                } else {
                    self.hck_reader(reader, &mut output)
                }
            }
            HckInput::Path(path) => {
                if self.config.try_decompress {
                    let reader = Self::open_decompressed(path.as_ref())?;
                    if fast {
                        self.hck_reader_fast(reader, &mut output)
                    } else {
                        self.hck_reader(reader, &mut output)
                    }
                } else {
                    let file = File::open(&path)?;
                    // Prefer a memory map when the configured strategy provides one; otherwise
                    // fall back to buffered reading of the file.
                    if let Some(mmap) = self.config.mmap_choice.open(&file, Some(&path)) {
                        if fast {
                            self.hck_bytes_fast(mmap.as_bytes(), &mut output)
                        } else {
                            self.hck_bytes(mmap.as_bytes(), &mut output)
                        }
                    } else if fast {
                        self.hck_reader_fast(file, &mut output)
                    } else {
                        self.hck_reader(file, &mut output)
                    }
                }
            }
        }
    }

    /// Processes an in-memory byte buffer line by line using the configured line parser.
    ///
    /// # Errors
    ///
    /// Returns any I/O error raised while writing to `output`.
    pub fn hck_bytes<W>(&mut self, bytes: &[u8], mut output: W) -> Result<(), io::Error>
    where
        W: Write,
    {
        let iter = LineIter::new(self.config.line_terminator.as_byte(), bytes);
        // The slot vectors are allocated once with a `'static` element lifetime and re-typed
        // per line so their allocations can be reused across lines.
        let mut shuffler: Vec<Vec<&'static [u8]>> = vec![vec![]; self.shuffler_len()];
        for line in iter {
            let mut s: Vec<Vec<&[u8]>> = shuffler;
            self.line_parser.parse_line(
                lines::without_terminator(line, self.config.line_terminator),
                &mut s,
            );
            let items = s.iter_mut().flat_map(|s| s.drain(..));
            output.join_append(
                self.config.output_delimiter,
                items,
                &self.config.line_terminator,
            )?;
            // SAFETY: the source and target types differ only in the lifetime of the slice
            // references. `join_append` consumed the draining iterator to completion, so every
            // inner `Vec` is empty and no line-borrowed reference is carried over.
            shuffler = unsafe { core::mem::transmute(s) };
        }
        Ok(())
    }

    /// Processes an in-memory byte buffer with the single-byte-delimiter parser.
    ///
    /// Intended for use only when `allow_fastmode` holds (the first delimiter byte is used).
    ///
    /// # Errors
    ///
    /// Returns any I/O error raised while writing to `output`.
    pub fn hck_bytes_fast<W: Write>(&mut self, bytes: &[u8], output: W) -> Result<(), io::Error> {
        let mut buffer_parser = SingleByteDelimParser::new(
            self.config.line_terminator,
            self.config.output_delimiter,
            self.fields,
            self.config.delimiter[0],
        );
        buffer_parser.process_buffer(bytes, output)?;
        Ok(())
    }

    /// Processes a reader chunk by chunk with the single-byte-delimiter parser.
    ///
    /// Each filled buffer is handed to the parser, the parser is reset, and the whole buffer
    /// is consumed before the next fill.
    ///
    /// # Errors
    ///
    /// Returns any I/O error raised while reading from `reader` or writing to `output`.
    pub fn hck_reader_fast<R: Read, W: Write>(
        &mut self,
        reader: R,
        mut output: W,
    ) -> Result<(), io::Error> {
        let mut reader = LineBufferReader::new(reader, self.line_buffer);
        let mut buffer_parser = SingleByteDelimParser::new(
            self.config.line_terminator,
            self.config.output_delimiter,
            self.fields,
            self.config.delimiter[0],
        );
        while reader.fill()? {
            buffer_parser.process_buffer(reader.buffer(), &mut output)?;
            buffer_parser.reset();
            reader.consume(reader.buffer().len());
        }
        Ok(())
    }

    /// Processes a reader chunk by chunk, splitting every line with the configured line parser.
    ///
    /// # Errors
    ///
    /// Returns any I/O error raised while reading from `reader` or writing to `output`.
    pub fn hck_reader<R: Read, W: Write>(
        &mut self,
        reader: R,
        mut output: W,
    ) -> Result<(), io::Error> {
        let mut reader = LineBufferReader::new(reader, self.line_buffer);
        // See `hck_bytes`: slots are allocated once and re-typed per line to reuse capacity.
        let mut shuffler: Vec<Vec<&'static [u8]>> = vec![vec![]; self.shuffler_len()];
        while reader.fill()? {
            let iter = LineIter::new(self.config.line_terminator.as_byte(), reader.buffer());
            for line in iter {
                let mut s: Vec<Vec<&[u8]>> = shuffler;
                self.line_parser.parse_line(
                    lines::without_terminator(line, self.config.line_terminator),
                    &mut s,
                );
                let items = s.iter_mut().flat_map(|s| s.drain(..));
                output.join_append(
                    self.config.output_delimiter,
                    items,
                    &self.config.line_terminator,
                )?;
                // SAFETY: only the slice lifetime differs between the two types, and every
                // inner `Vec` was fully drained by `join_append` above, so no reference into
                // the reader's buffer survives past this point.
                shuffler = unsafe { core::mem::transmute(s) };
            }
            reader.consume(reader.buffer().len());
        }
        Ok(())
    }
}
/// Writes a sequence of byte slices as one delimited, terminated record.
pub trait JoinAppend {
    /// Writes every item of `items` separated by `sep`, followed by the bytes of `term`.
    ///
    /// The terminator is written even when `items` is empty.
    ///
    /// # Errors
    ///
    /// Returns the first I/O error encountered while writing.
    fn join_append<'b>(
        &mut self,
        sep: &[u8],
        items: impl Iterator<Item = &'b [u8]>,
        term: &LineTerminator,
    ) -> Result<(), io::Error>;
}

/// Every writer can emit joined records.
impl<W: Write> JoinAppend for W {
    #[inline(always)]
    fn join_append<'b>(
        &mut self,
        sep: &[u8],
        items: impl Iterator<Item = &'b [u8]>,
        term: &LineTerminator,
    ) -> Result<(), io::Error> {
        for (index, item) in items.enumerate() {
            // The separator goes between items, never before the first one.
            if index > 0 {
                self.write_all(sep)?;
            }
            self.write_all(item)?;
        }
        self.write_all(term.as_bytes())
    }
}