use anyhow::{Context, Error, Result};
use clap::Parser;
use env_logger::Env;
use flate2::Compression;
use git_version::git_version;
use grep_cli::{stdout, unescape};
use gzp::{ZBuilder, deflate::Bgzf};
use hcklib::{
core::{Core, CoreConfig, CoreConfigBuilder, HckInput},
field_range::RegexOrString,
line_parser::{RegexLineParser, SubStrLineParser},
mmap::MmapChoice,
};
use lazy_static::lazy_static;
use log::{error, warn};
use regex::bytes::Regex;
use ripline::{
LineTerminator,
line_buffer::{LineBuffer, LineBufferBuilder},
};
use std::{
fs::File,
io::{self, BufWriter, Write},
path::{Path, PathBuf},
process::exit,
};
use termcolor::ColorChoice;
lazy_static! {
    /// Default value for `--compression-threads`, rendered as a string so it can be
    /// handed to clap as a `default_value`.
    ///
    /// Machines with four or more cores get `4`; smaller machines get one less than
    /// their core count (which is `0` on a single-core machine).
    pub static ref DEFAULT_CPUS: String = {
        let available = num_cpus::get();
        let threads = if available >= 4 {
            4
        } else {
            available.saturating_sub(1)
        };
        threads.to_string()
    };
}
/// Version string reported by the CLI; it is passed to clap as `version = HCK_VERSION`.
///
/// Built at compile time by `git_version!`. The `args` are forwarded to `git describe`
/// (`--always`, mark dirty trees with `-modified`, only consider tags matching `v*`).
/// The result is prefixed with `git:`; per the `git_version` crate docs, `cargo_prefix`
/// is the prefix used when the macro falls back to the Cargo package version.
pub const HCK_VERSION: &str = git_version!(
cargo_prefix = "cargo:",
prefix = "git:",
args = ["--always", "--dirty=-modified", "--match=v*"]
);
/// Open the destination that results should be written to.
///
/// A missing path, or the conventional `-` placeholder, selects stdout (with colour
/// disabled). Any other path is created/truncated as a regular file.
///
/// # Errors
/// Returns an error carrying the offending path when the file cannot be created.
fn select_output<P: AsRef<Path>>(output: Option<P>) -> Result<Box<dyn Write + Send + 'static>> {
    let sink: Box<dyn Write + Send + 'static> = match output {
        Some(path) if path.as_ref().as_os_str() != "-" => {
            let file = File::create(&path).with_context(|| {
                format!("Failed to open {} for writing.", path.as_ref().display())
            })?;
            Box::new(file)
        }
        // Either no path was given or it was "-": both mean stdout.
        _ => Box::new(stdout(ColorChoice::Never)),
    };
    Ok(sink)
}
#[inline]
fn is_broken_pipe(err: &Error) -> bool {
if let Some(io_err) = err.downcast_ref::<io::Error>()
&& io_err.kind() == io::ErrorKind::BrokenPipe
{
return true;
}
false
}
// Command-line options, parsed with clap's derive API.
//
// NOTE(review): the comments in this struct are deliberately plain `//` comments.
// clap turns `///` doc comments on a `Parser` struct and its fields into `--help`
// text, so converting them would change what the program prints.
#[derive(Debug, Parser)]
#[clap(author, version = HCK_VERSION)]
struct Opts {
// Input paths. `main` reads stdin when this is empty and treats a "-" entry as stdin.
input: Vec<PathBuf>,
// Output path. `select_output` writes to stdout when this is absent or "-".
#[clap(short, long, allow_hyphen_values = true)]
output: Option<PathBuf>,
// Input field delimiter (default `\s+`). Handed to the core config as bytes and
// treated as a regex unless `delim_is_literal` is set.
#[clap(short, long, default_value = r"\s+", allow_hyphen_values = true)]
delimiter: String,
// Treat `delimiter` as a literal: `main` passes `!delim_is_literal` to `is_regex_parser`.
#[clap(short = 'L', long)]
delim_is_literal: bool,
// Reuse the (unescaped) input delimiter as the output delimiter. Requires
// `delim_is_literal` and is declared as conflicting with `output_delimiter`.
#[clap(
short = 'I',
long,
requires("delim-is-literal"),
conflicts_with("output-delimiter")
)]
use_input_delim: bool,
// Delimiter written between output fields (default: tab). Passed through
// `unescape` in `main` before being given to the core config.
#[clap(short = 'D', long, default_value = "\t", allow_hyphen_values = true)]
output_delimiter: String,
// Field selection spec, forwarded to `CoreConfigBuilder::fields`. The tests in this
// file use 1-based lists and ranges such as `1,3`, `2-`, `-4,7`.
#[clap(short, long, allow_hyphen_values = true)]
fields: Option<String>,
// Field spec to drop from the selection, forwarded to `CoreConfigBuilder::exclude`.
#[clap(short = 'e', long, allow_hyphen_values = true)]
exclude: Option<String>,
// Header patterns to exclude, forwarded to `exclude_headers`. One value per flag
// occurrence (`number_of_values = 1`).
#[clap(short = 'E', long, number_of_values = 1, allow_hyphen_values = true)]
exclude_header: Option<Vec<Regex>>,
// Header patterns to select, forwarded to `headers`. One value per flag occurrence.
#[clap(short = 'F', long, number_of_values = 1, allow_hyphen_values = true)]
header_field: Option<Vec<Regex>>,
// Forwarded to `CoreConfigBuilder::header_is_regex`; presumably switches header
// patterns between regex and literal matching — confirm in hcklib.
#[clap(short = 'r', long)]
header_is_regex: bool,
// Forwarded to `CoreConfigBuilder::try_decompress`; presumably enables decompression
// of compressed input — confirm in hcklib.
#[clap(short = 'z', long)]
try_decompress: bool,
// When set, `main` wraps the output writer in a gzp `ZBuilder::<Bgzf, _>` compressor.
#[clap(short = 'Z', long)]
try_compress: bool,
// Thread count handed to the compressor; defaults to `DEFAULT_CPUS`.
#[clap(short = 't', long, default_value=&DEFAULT_CPUS.as_str())]
compression_threads: usize,
// Compression level handed to `flate2::Compression::new` (default 6).
#[clap(short = 'l', long, default_value = "6")]
compression_level: u32,
// Use `MmapChoice::never()` instead of `MmapChoice::auto()`.
#[clap(long)]
no_mmap: bool,
// Use `LineTerminator::crlf()` instead of the default line terminator.
#[clap(long)]
crlf: bool,
}
/// Program entry point: parse the CLI, build the output writer and core configuration,
/// then process every input in order.
///
/// A broken pipe while processing ends the program with exit code 0; any other
/// processing error is logged and ends it with exit code 1.
fn main() -> Result<()> {
    env_logger::Builder::from_env(Env::default().default_filter_or("info")).init();
    let opts = Opts::parse();

    // The destination is opened first, then optionally wrapped in a BGZF compressor.
    let raw_out = select_output(opts.output.as_ref())?;
    let mut writer: Box<dyn Write> = if opts.try_compress {
        let level = Compression::new(opts.compression_level);
        Box::new(
            ZBuilder::<Bgzf, _>::new()
                .compression_level(level)
                .num_threads(opts.compression_threads)
                .from_writer(raw_out),
        )
    } else {
        Box::new(BufWriter::new(raw_out))
    };

    let reading_stdin_only = opts.input.is_empty();
    if reading_stdin_only && opts.try_decompress && opts.header_field.is_some() {
        warn!(
            "Selections based on header fields is not currently supported on STDIN compressed data."
        );
    }

    // No inputs means stdin; an explicit "-" entry also means stdin.
    let inputs: Vec<HckInput<PathBuf>> = match opts.input.as_slice() {
        [] => vec![HckInput::Stdin],
        paths => paths
            .iter()
            .map(|path| {
                if path.as_os_str() == "-" {
                    HckInput::Stdin
                } else {
                    HckInput::Path(path.clone())
                }
            })
            .collect(),
    };

    let line_term = if opts.crlf {
        LineTerminator::crlf()
    } else {
        LineTerminator::default()
    };
    let mmap = if opts.no_mmap {
        MmapChoice::never()
    } else {
        // NOTE(review): `MmapChoice::auto` is an unsafe constructor; the justification
        // for calling it here is not visible in this file — confirm against hcklib.
        unsafe { MmapChoice::auto() }
    };
    // With `-L -I` the input delimiter doubles as the output delimiter.
    let out_delim = unescape(if opts.delim_is_literal && opts.use_input_delim {
        &opts.delimiter
    } else {
        &opts.output_delimiter
    });

    let conf = CoreConfigBuilder::new()
        .line_terminator(line_term)
        .mmap(mmap)
        .delimiter(opts.delimiter.as_bytes())
        .output_delimiter(&out_delim)
        .is_regex_parser(!opts.delim_is_literal)
        .try_decompress(opts.try_decompress)
        .fields(opts.fields.as_deref())
        .headers(opts.header_field.as_deref())
        .exclude(opts.exclude.as_deref())
        .exclude_headers(opts.exclude_header.as_deref())
        .header_is_regex(opts.header_is_regex)
        .build()?;

    // One line buffer is shared across all inputs.
    let mut line_buffer = LineBufferBuilder::new().build();
    for input in inputs {
        let Err(err) = run(input, &mut writer, &conf, &mut line_buffer) else {
            continue;
        };
        if is_broken_pipe(&err) {
            exit(0)
        }
        error!("{}", err);
        exit(1)
    }
    Ok(())
}
/// Process a single input and write the selected fields to `writer`.
///
/// The field selection is resolved against `input` first; when it resolves to no
/// fields at all, nothing is written and `Ok(())` is returned. Otherwise a `Core` is
/// built with either the regex or the substring line parser, depending on how the
/// configured delimiter was parsed.
///
/// # Errors
/// Propagates failures from `parse_fields` and from `Core::hck_input`.
fn run<W: Write>(
    input: HckInput<PathBuf>,
    writer: &mut W,
    conf: &CoreConfig,
    line_buffer: &mut LineBuffer,
) -> Result<()> {
    let (extra, fields) = conf.parse_fields(&input)?;
    if !fields.is_empty() {
        match conf.parsed_delim() {
            RegexOrString::Regex(pattern) => {
                let parser = RegexLineParser::new(&fields, pattern);
                let mut engine = Core::new(conf, &fields, parser, line_buffer);
                engine.hck_input(input, writer, extra)?;
            }
            RegexOrString::String(literal) => {
                let parser = SubStrLineParser::new(&fields, literal.as_bytes());
                let mut engine = Core::new(conf, &fields, parser, line_buffer);
                engine.hck_input(input, writer, extra)?;
            }
        }
    }
    Ok(())
}
#[cfg(test)]
mod test {
use std::io::BufReader;
use super::*;
use bstr::io::BufReadExt;
use rstest::rstest;
use tempfile::TempDir;
fn build_opts(
input_file: impl AsRef<Path>,
output_file: impl AsRef<Path>,
fields: &str,
no_mmap: bool,
delimiter: &str,
) -> Opts {
Opts {
input: vec![input_file.as_ref().to_path_buf()],
output: Some(output_file.as_ref().to_path_buf()),
delimiter: delimiter.to_string(),
delim_is_literal: false,
output_delimiter: "\t".to_owned(),
use_input_delim: false,
fields: Some(fields.to_owned()),
header_field: None,
header_is_regex: true,
try_decompress: false,
try_compress: false,
no_mmap,
crlf: false,
exclude: None,
exclude_header: None,
compression_level: 3,
compression_threads: 0,
}
}
fn build_opts_not_regex(
input_file: impl AsRef<Path>,
output_file: impl AsRef<Path>,
fields: &str,
no_mmap: bool,
delimiter: &str,
) -> Opts {
Opts {
input: vec![input_file.as_ref().to_path_buf()],
output: Some(output_file.as_ref().to_path_buf()),
delimiter: delimiter.to_string(),
delim_is_literal: true,
output_delimiter: "\t".to_owned(),
use_input_delim: false,
fields: Some(fields.to_owned()),
header_field: None,
header_is_regex: true,
try_decompress: false,
try_compress: false,
no_mmap,
crlf: false,
exclude: None,
exclude_header: None,
compression_level: 3,
compression_threads: 0,
}
}
/// Build an `Opts` for tests with every selection-related knob exposed.
///
/// Fixed for all tests: a tab output delimiter, no compression or decompression,
/// no CRLF handling, no header exclusion, compression level 3 and 0 compression threads.
// The argument list mirrors the `Opts` fields the tests vary, hence the clippy allowance.
#[allow(clippy::too_many_arguments)]
fn build_opts_generic(
    input_file: impl AsRef<Path>,
    output_file: impl AsRef<Path>,
    fields: Option<&str>,
    header_field: Option<Vec<Regex>>,
    exclude: Option<&str>,
    no_mmap: bool,
    delimiter: &str,
    delim_is_literal: bool,
    header_is_regex: bool,
) -> Opts {
    let input = vec![input_file.as_ref().to_path_buf()];
    let output = Some(output_file.as_ref().to_path_buf());
    Opts {
        input,
        output,
        delimiter: delimiter.to_owned(),
        delim_is_literal,
        use_input_delim: false,
        output_delimiter: String::from("\t"),
        fields: fields.map(str::to_owned),
        exclude: exclude.map(str::to_owned),
        exclude_header: None,
        header_field,
        header_is_regex,
        try_decompress: false,
        try_compress: false,
        compression_threads: 0,
        compression_level: 3,
        no_mmap,
        crlf: false,
    }
}
/// Read `path` line by line and split every line on runs of whitespace (`\s+`).
///
/// Returns one `Vec<String>` per line. Panics if the file cannot be opened or read.
fn read_tsv(path: impl AsRef<Path>) -> Vec<Vec<String>> {
    let reader = BufReader::new(File::open(path).unwrap());
    let splitter = Regex::new(r"\s+").unwrap();
    reader
        .byte_lines()
        .map(|line| {
            let bytes = line.unwrap();
            splitter
                .split(&bytes)
                // NOTE(review): the bytes are not validated; the invalid-UTF-8 tests rely
                // on this, but a `String` holding invalid UTF-8 violates the contract of
                // `String::from_utf8_unchecked`.
                .map(|cell| unsafe { String::from_utf8_unchecked(cell.to_vec()) })
                .collect::<Vec<String>>()
        })
        .collect()
}
/// Write `data` to `path`, one row per line, joining the cells of each row with `sep`.
///
/// Panics if the file cannot be created, written or flushed.
fn write_file(path: impl AsRef<Path>, data: Vec<Vec<&str>>, sep: &str) {
    let mut out = BufWriter::new(File::create(path).unwrap());
    for cells in &data {
        let line = cells.join(sep);
        writeln!(out, "{}", line).unwrap();
    }
    // Flush explicitly so a write failure panics here instead of being lost on drop.
    out.flush().unwrap();
}
fn run_wrapper<P: AsRef<Path>>(input: P, output: P, opts: &Opts) {
let conf = CoreConfigBuilder::new()
.delimiter(opts.delimiter.as_bytes())
.is_regex_parser(!opts.delim_is_literal)
.mmap(if opts.no_mmap {
MmapChoice::never()
} else {
unsafe { MmapChoice::auto() }
})
.output_delimiter(opts.output_delimiter.as_bytes())
.headers(opts.header_field.as_deref())
.fields(opts.fields.as_deref())
.exclude(opts.exclude.as_deref())
.exclude_headers(opts.exclude_header.as_deref())
.header_is_regex(opts.header_is_regex)
.build()
.unwrap();
let mut line_buffer = LineBufferBuilder::new().build();
let mut writer = BufWriter::new(File::create(output).unwrap());
run(
HckInput::Path(input.as_ref().to_owned()),
&mut writer,
&conf,
&mut line_buffer,
)
.unwrap();
}
/// Four consecutive spaces, used as a multi-character separator when writing fixtures
/// that are then split with the `\s+` regex delimiter.
const FOURSPACE: &str = "    ";
/// Selecting fields `1,3` while excluding `3` leaves only the first column.
#[rstest]
fn test_exclude_one(
    #[values(true, false)] no_mmap: bool,
    #[values(r" ", " ")] hck_delim: &str,
    #[values(true, false)] delim_is_literal: bool,
) {
    let workdir = TempDir::new().unwrap();
    let (src, dst) = (
        workdir.path().join("input.txt"),
        workdir.path().join("output.txt"),
    );
    let rows = vec![vec!["a", "b", "c"], vec!["1", "2", "3"]];
    write_file(&src, rows, hck_delim);
    let opts = build_opts_generic(
        &src,
        &dst,
        Some("1,3"),
        None,
        Some("3"),
        no_mmap,
        hck_delim,
        delim_is_literal,
        false,
    );
    run_wrapper(&src, &dst, &opts);
    let expected = vec![vec!["a"], vec!["1"]];
    assert_eq!(read_tsv(dst), expected);
}
/// Selecting `3-` while excluding `-5` keeps only the sixth column.
#[rstest]
fn test_exclude_range_overlap_front(
    #[values(true, false)] no_mmap: bool,
    #[values(r" ", " ")] hck_delim: &str,
    #[values(true, false)] delim_is_literal: bool,
) {
    let workdir = TempDir::new().unwrap();
    let (src, dst) = (
        workdir.path().join("input.txt"),
        workdir.path().join("output.txt"),
    );
    let rows = vec![
        vec!["a", "b", "c", "d", "e", "f"],
        vec!["1", "2", "3", "4", "5", "6"],
    ];
    write_file(&src, rows, hck_delim);
    let opts = build_opts_generic(
        &src,
        &dst,
        Some("3-"),
        None,
        Some("-5"),
        no_mmap,
        hck_delim,
        delim_is_literal,
        false,
    );
    run_wrapper(&src, &dst, &opts);
    let expected = vec![vec!["f"], vec!["6"]];
    assert_eq!(read_tsv(dst), expected);
}
/// Selecting `2-5` while excluding `3-` keeps only the second column.
#[rstest]
fn test_exclude_range_overlap_back(
    #[values(true, false)] no_mmap: bool,
    #[values(r" ", " ")] hck_delim: &str,
    #[values(true, false)] delim_is_literal: bool,
) {
    let workdir = TempDir::new().unwrap();
    let (src, dst) = (
        workdir.path().join("input.txt"),
        workdir.path().join("output.txt"),
    );
    let rows = vec![
        vec!["a", "b", "c", "d", "e", "f"],
        vec!["1", "2", "3", "4", "5", "6"],
    ];
    write_file(&src, rows, hck_delim);
    let opts = build_opts_generic(
        &src,
        &dst,
        Some("2-5"),
        None,
        Some("3-"),
        no_mmap,
        hck_delim,
        delim_is_literal,
        false,
    );
    run_wrapper(&src, &dst, &opts);
    let expected = vec![vec!["b"], vec!["2"]];
    assert_eq!(read_tsv(dst), expected);
}
/// Excluding `3-5` from the open range `1-` splits it into columns 1, 2 and 6.
#[rstest]
fn test_exclude_range_split_fields(
    #[values(true, false)] no_mmap: bool,
    #[values(r" ", " ")] hck_delim: &str,
    #[values(true, false)] delim_is_literal: bool,
) {
    let workdir = TempDir::new().unwrap();
    let (src, dst) = (
        workdir.path().join("input.txt"),
        workdir.path().join("output.txt"),
    );
    let rows = vec![
        vec!["a", "b", "c", "d", "e", "f"],
        vec!["1", "2", "3", "4", "5", "6"],
    ];
    write_file(&src, rows, hck_delim);
    let opts = build_opts_generic(
        &src,
        &dst,
        Some("1-"),
        None,
        Some("3-5"),
        no_mmap,
        hck_delim,
        delim_is_literal,
        false,
    );
    run_wrapper(&src, &dst, &opts);
    let expected = vec![vec!["a", "b", "f"], vec!["1", "2", "6"]];
    assert_eq!(read_tsv(dst), expected);
}
/// When the exclusion `2-5` covers every selected field (`4,3`), nothing is written.
#[rstest]
fn test_exclude_range_all(
    #[values(true, false)] no_mmap: bool,
    #[values(r" ", " ")] hck_delim: &str,
    #[values(true, false)] delim_is_literal: bool,
) {
    let workdir = TempDir::new().unwrap();
    let (src, dst) = (
        workdir.path().join("input.txt"),
        workdir.path().join("output.txt"),
    );
    let rows = vec![
        vec!["a", "b", "c", "d", "e", "f"],
        vec!["1", "2", "3", "4", "5", "6"],
    ];
    write_file(&src, rows, hck_delim);
    let opts = build_opts_generic(
        &src,
        &dst,
        Some("4,3"),
        None,
        Some("2-5"),
        no_mmap,
        hck_delim,
        delim_is_literal,
        false,
    );
    run_wrapper(&src, &dst, &opts);
    let observed = read_tsv(dst);
    assert!(observed.is_empty());
}
/// Exclusion combined with reordering: `4-6,1-3` minus `3-5` yields columns 6, 1, 2.
#[rstest]
fn test_exclude_range_split_fields_reorder(
    #[values(true, false)] no_mmap: bool,
    #[values(r" ", " ")] hck_delim: &str,
    #[values(true, false)] delim_is_literal: bool,
) {
    let workdir = TempDir::new().unwrap();
    let (src, dst) = (
        workdir.path().join("input.txt"),
        workdir.path().join("output.txt"),
    );
    let rows = vec![
        vec!["a", "b", "c", "d", "e", "f"],
        vec!["1", "2", "3", "4", "5", "6"],
    ];
    write_file(&src, rows, hck_delim);
    let opts = build_opts_generic(
        &src,
        &dst,
        Some("4-6,1-3"),
        None,
        Some("3-5"),
        no_mmap,
        hck_delim,
        delim_is_literal,
        false,
    );
    run_wrapper(&src, &dst, &opts);
    let expected = vec![vec!["f", "a", "b"], vec!["6", "1", "2"]];
    assert_eq!(read_tsv(dst), expected);
}
/// Selecting by the single header pattern `a` returns only that column.
#[rstest]
fn test_headers_simple(
    #[values(true, false)] no_mmap: bool,
    #[values(r" ", " ")] hck_delim: &str,
    #[values(true, false)] delim_is_literal: bool,
    #[values(true, false)] header_is_regex: bool,
) {
    let workdir = TempDir::new().unwrap();
    let (src, dst) = (
        workdir.path().join("input.txt"),
        workdir.path().join("output.txt"),
    );
    let rows = vec![
        vec!["a", "b", "c", "d", "e", "f"],
        vec!["1", "2", "3", "4", "5", "6"],
    ];
    write_file(&src, rows, hck_delim);
    let headers = vec![Regex::new("a").unwrap()];
    let opts = build_opts_generic(
        &src,
        &dst,
        None,
        Some(headers),
        None,
        no_mmap,
        hck_delim,
        delim_is_literal,
        header_is_regex,
    );
    run_wrapper(&src, &dst, &opts);
    let expected = vec![vec!["a"], vec!["1"]];
    assert_eq!(read_tsv(dst), expected);
}
/// Selecting by the header patterns `a` and `c` returns those two columns.
#[rstest]
fn test_headers_simple2(
    #[values(true, false)] no_mmap: bool,
    #[values(r" ", " ")] hck_delim: &str,
    #[values(true, false)] delim_is_literal: bool,
    #[values(true, false)] header_is_regex: bool,
) {
    let workdir = TempDir::new().unwrap();
    let (src, dst) = (
        workdir.path().join("input.txt"),
        workdir.path().join("output.txt"),
    );
    let rows = vec![vec!["a", "b", "c"], vec!["1", "2", "3"]];
    write_file(&src, rows, hck_delim);
    let headers = vec![Regex::new("a").unwrap(), Regex::new("c").unwrap()];
    let opts = build_opts_generic(
        &src,
        &dst,
        None,
        Some(headers),
        None,
        no_mmap,
        hck_delim,
        delim_is_literal,
        header_is_regex,
    );
    run_wrapper(&src, &dst, &opts);
    let expected = vec![vec!["a", "c"], vec!["1", "3"]];
    assert_eq!(read_tsv(dst), expected);
}
/// A field listed twice (`3,3,1,2`) is emitted once, at its first position.
#[rstest]
fn test_duplicate_field_selection_more(
    #[values(true, false)] no_mmap: bool,
    #[values(r" ", " ")] hck_delim: &str,
    #[values(true, false)] delim_is_literal: bool,
    #[values(true, false)] header_is_regex: bool,
) {
    let workdir = TempDir::new().unwrap();
    let (src, dst) = (
        workdir.path().join("input.txt"),
        workdir.path().join("output.txt"),
    );
    let rows = vec![vec!["a", "b", "c", "d", "e"], vec!["1", "2", "3", "4", "5"]];
    write_file(&src, rows, hck_delim);
    let opts = build_opts_generic(
        &src,
        &dst,
        Some("3,3,1,2"),
        None,
        None,
        no_mmap,
        hck_delim,
        delim_is_literal,
        header_is_regex,
    );
    run_wrapper(&src, &dst, &opts);
    let expected = vec![vec!["c", "a", "b"], vec!["3", "1", "2"]];
    assert_eq!(read_tsv(dst), expected);
}
/// Overlapping ranges (`2-3,5,1,2-4`) emit each column once: 2, 3, 5, 1, 4.
#[rstest]
fn test_duplicate_field_selection_range(
    #[values(true, false)] no_mmap: bool,
    #[values(r" ", " ")] hck_delim: &str,
    #[values(true, false)] delim_is_literal: bool,
    #[values(true, false)] header_is_regex: bool,
) {
    let workdir = TempDir::new().unwrap();
    let (src, dst) = (
        workdir.path().join("input.txt"),
        workdir.path().join("output.txt"),
    );
    let rows = vec![vec!["a", "b", "c", "d", "e"], vec!["1", "2", "3", "4", "5"]];
    write_file(&src, rows, hck_delim);
    let opts = build_opts_generic(
        &src,
        &dst,
        Some("2-3,5,1,2-4"),
        None,
        None,
        no_mmap,
        hck_delim,
        delim_is_literal,
        header_is_regex,
    );
    run_wrapper(&src, &dst, &opts);
    let expected = vec![vec!["b", "c", "e", "a", "d"], vec!["2", "3", "5", "1", "4"]];
    assert_eq!(read_tsv(dst), expected);
}
/// Combining the index selection `3` with header patterns `b`, `a` yields b, c, a.
#[rstest]
fn test_headers_and_fields(
    #[values(true, false)] no_mmap: bool,
    #[values(r" ", " ")] hck_delim: &str,
    #[values(true, false)] delim_is_literal: bool,
    #[values(true, false)] header_is_regex: bool,
) {
    let workdir = TempDir::new().unwrap();
    let (src, dst) = (
        workdir.path().join("input.txt"),
        workdir.path().join("output.txt"),
    );
    let rows = vec![vec!["a", "b", "c", "d", "e"], vec!["1", "2", "3", "4", "5"]];
    write_file(&src, rows, hck_delim);
    let headers = vec![Regex::new("b").unwrap(), Regex::new("a").unwrap()];
    let opts = build_opts_generic(
        &src,
        &dst,
        Some("3"),
        Some(headers),
        None,
        no_mmap,
        hck_delim,
        delim_is_literal,
        header_is_regex,
    );
    run_wrapper(&src, &dst, &opts);
    let expected = vec![vec!["b", "c", "a"], vec!["2", "3", "1"]];
    assert_eq!(read_tsv(dst), expected);
}
/// A repeated field (`3,1,3`) is emitted only once: columns 3 then 1.
#[rstest]
fn test_duplicate_field_selection(
    #[values(true, false)] no_mmap: bool,
    #[values(r" ", " ")] hck_delim: &str,
    #[values(true, false)] delim_is_literal: bool,
    #[values(true, false)] header_is_regex: bool,
) {
    let workdir = TempDir::new().unwrap();
    let (src, dst) = (
        workdir.path().join("input.txt"),
        workdir.path().join("output.txt"),
    );
    let rows = vec![vec!["a", "b", "c", "d"], vec!["1", "2", "3", "4"]];
    write_file(&src, rows, hck_delim);
    let opts = build_opts_generic(
        &src,
        &dst,
        Some("3,1,3"),
        None,
        None,
        no_mmap,
        hck_delim,
        delim_is_literal,
        header_is_regex,
    );
    run_wrapper(&src, &dst, &opts);
    let expected = vec![vec!["c", "a"], vec!["3", "1"]];
    assert_eq!(read_tsv(dst), expected);
}
/// Regex delimiter: selecting field `1` returns the first column.
#[rstest]
#[rustfmt::skip::macros(vec)]
fn test_read_single_values(
    #[values(true, false)] no_mmap: bool,
    #[values(" ", " ")] hck_delim: &str,
) {
    let workdir = TempDir::new().unwrap();
    let (src, dst) = (
        workdir.path().join("input.txt"),
        workdir.path().join("output.txt"),
    );
    let rows = vec![
        vec!["a", "b", "c"],
        vec!["1", "2", "3"],
    ];
    write_file(&src, rows, hck_delim);
    let opts = build_opts(&src, &dst, "1", no_mmap, hck_delim);
    run_wrapper(&src, &dst, &opts);
    let expected = vec![vec!["a"], vec!["1"]];
    assert_eq!(read_tsv(dst), expected);
}
/// Regex delimiter `\s+`: selecting `1,3` returns the first and third columns.
#[rstest]
fn test_read_several_single_values(
    #[values(true, false)] no_mmap: bool,
    #[values(r"\s+")] hck_delim: &str,
) {
    let workdir = TempDir::new().unwrap();
    let (src, dst) = (
        workdir.path().join("input.txt"),
        workdir.path().join("output.txt"),
    );
    let rows = vec![vec!["a", "b", "c"], vec!["1", "2", "3"]];
    write_file(&src, rows, FOURSPACE);
    let opts = build_opts(&src, &dst, "1,3", no_mmap, hck_delim);
    run_wrapper(&src, &dst, &opts);
    let expected = vec![vec!["a", "c"], vec!["1", "3"]];
    assert_eq!(read_tsv(dst), expected);
}
/// Regex delimiter `\s+`: bytes that are not valid UTF-8 pass through a selected
/// field unchanged.
#[rstest]
fn test_read_several_single_values_with_invalid_utf8(
    #[values(true, false)] no_mmap: bool,
    #[values(r"\s+")] hck_delim: &str,
) {
    let workdir = TempDir::new().unwrap();
    let (src, dst) = (
        workdir.path().join("input.txt"),
        workdir.path().join("output.txt"),
    );
    // NOTE(review): deliberately builds a `String` from invalid UTF-8 to get the raw
    // bytes into the fixture; this violates `String::from_utf8_unchecked`'s contract.
    let bad_str = unsafe { String::from_utf8_unchecked(b"a\xED\xA0\x80z".to_vec()) };
    let rows = vec![vec![bad_str.as_str(), "b", "c"], vec!["1", "2", "3"]];
    write_file(&src, rows, FOURSPACE);
    let opts = build_opts(&src, &dst, "1,3", no_mmap, hck_delim);
    run_wrapper(&src, &dst, &opts);
    let expected = vec![vec![bad_str.as_str(), "c"], vec!["1", "3"]];
    assert_eq!(read_tsv(dst), expected);
}
/// Regex delimiter `\s+`: the open range `2-` returns every column from the second on.
#[rstest]
fn test_read_single_range(
    #[values(true, false)] no_mmap: bool,
    #[values(r"\s+")] hck_delim: &str,
) {
    let workdir = TempDir::new().unwrap();
    let (src, dst) = (
        workdir.path().join("input.txt"),
        workdir.path().join("output.txt"),
    );
    let rows = vec![vec!["a", "b", "c", "d"], vec!["1", "2", "3", "4"]];
    write_file(&src, rows, FOURSPACE);
    let opts = build_opts(&src, &dst, "2-", no_mmap, hck_delim);
    run_wrapper(&src, &dst, &opts);
    let expected = vec![vec!["b", "c", "d"], vec!["2", "3", "4"]];
    assert_eq!(read_tsv(dst), expected);
}
/// Regex delimiter `\s+`: two ranges (`2-4,6-`) return columns 2-4 followed by 6-7.
#[rstest]
fn test_read_serveral_range(
    #[values(true, false)] no_mmap: bool,
    #[values(r"\s+")] hck_delim: &str,
) {
    let workdir = TempDir::new().unwrap();
    let (src, dst) = (
        workdir.path().join("input.txt"),
        workdir.path().join("output.txt"),
    );
    let rows = vec![
        vec!["a", "b", "c", "d", "e", "f", "g"],
        vec!["1", "2", "3", "4", "5", "6", "7"],
    ];
    write_file(&src, rows, FOURSPACE);
    let opts = build_opts(&src, &dst, "2-4,6-", no_mmap, hck_delim);
    run_wrapper(&src, &dst, &opts);
    let expected = vec![vec!["b", "c", "d", "f", "g"], vec!["2", "3", "4", "6", "7"]];
    assert_eq!(read_tsv(dst), expected);
}
/// Regex delimiter `\s+`: a single index plus an open range (`2,4-`).
#[rstest]
fn test_read_mixed_fields1(
    #[values(true, false)] no_mmap: bool,
    #[values(r"\s+")] hck_delim: &str,
) {
    let workdir = TempDir::new().unwrap();
    let (src, dst) = (
        workdir.path().join("input.txt"),
        workdir.path().join("output.txt"),
    );
    let rows = vec![
        vec!["a", "b", "c", "d", "e", "f", "g"],
        vec!["1", "2", "3", "4", "5", "6", "7"],
    ];
    write_file(&src, rows, FOURSPACE);
    let opts = build_opts(&src, &dst, "2,4-", no_mmap, hck_delim);
    run_wrapper(&src, &dst, &opts);
    let expected = vec![vec!["b", "d", "e", "f", "g"], vec!["2", "4", "5", "6", "7"]];
    assert_eq!(read_tsv(dst), expected);
}
/// Regex delimiter `\s+`: a leading open range plus a single index (`-4,7`).
#[rstest]
fn test_read_mixed_fields2(
    #[values(true, false)] no_mmap: bool,
    #[values(r"\s+")] hck_delim: &str,
) {
    let workdir = TempDir::new().unwrap();
    let (src, dst) = (
        workdir.path().join("input.txt"),
        workdir.path().join("output.txt"),
    );
    let rows = vec![
        vec!["a", "b", "c", "d", "e", "f", "g"],
        vec!["1", "2", "3", "4", "5", "6", "7"],
    ];
    write_file(&src, rows, FOURSPACE);
    let opts = build_opts(&src, &dst, "-4,7", no_mmap, hck_delim);
    run_wrapper(&src, &dst, &opts);
    let expected = vec![vec!["a", "b", "c", "d", "g"], vec!["1", "2", "3", "4", "7"]];
    assert_eq!(read_tsv(dst), expected);
}
/// Regex delimiter `\s+`: when the delimiter never matches (cells joined with `-`),
/// each whole line is emitted as the single field.
#[rstest]
fn test_read_no_delimis_found(
    #[values(true, false)] no_mmap: bool,
    #[values(r"\s+")] hck_delim: &str,
) {
    let workdir = TempDir::new().unwrap();
    let (src, dst) = (
        workdir.path().join("input.txt"),
        workdir.path().join("output.txt"),
    );
    let rows = vec![
        vec!["a", "b", "c", "d", "e", "f", "g"],
        vec!["1", "2", "3", "4", "5", "6", "7"],
    ];
    write_file(&src, rows, "-");
    let opts = build_opts(&src, &dst, "-4,7", no_mmap, hck_delim);
    run_wrapper(&src, &dst, &opts);
    let expected = vec![vec!["a-b-c-d-e-f-g"], vec!["1-2-3-4-5-6-7"]];
    assert_eq!(read_tsv(dst), expected);
}
/// Regex delimiter `\s+`: selections past the last column (`8`, `11-`) are ignored.
#[rstest]
fn test_read_over_end(#[values(true, false)] no_mmap: bool, #[values(r"\s+")] hck_delim: &str) {
    let workdir = TempDir::new().unwrap();
    let (src, dst) = (
        workdir.path().join("input.txt"),
        workdir.path().join("output.txt"),
    );
    let rows = vec![
        vec!["a", "b", "c", "d", "e", "f", "g"],
        vec!["1", "2", "3", "4", "5", "6", "7"],
    ];
    write_file(&src, rows, FOURSPACE);
    let opts = build_opts(&src, &dst, "-4,8,11-", no_mmap, hck_delim);
    run_wrapper(&src, &dst, &opts);
    let expected = vec![vec!["a", "b", "c", "d"], vec!["1", "2", "3", "4"]];
    assert_eq!(read_tsv(dst), expected);
}
/// Regex delimiter `\s+`: output follows the selection order (`6,-4` gives 6, 1-4).
#[rstest]
fn test_reorder1(#[values(true, false)] no_mmap: bool, #[values(r"\s+")] hck_delim: &str) {
    let workdir = TempDir::new().unwrap();
    let (src, dst) = (
        workdir.path().join("input.txt"),
        workdir.path().join("output.txt"),
    );
    let rows = vec![
        vec!["a", "b", "c", "d", "e", "f", "g"],
        vec!["1", "2", "3", "4", "5", "6", "7"],
    ];
    write_file(&src, rows, FOURSPACE);
    let opts = build_opts(&src, &dst, "6,-4", no_mmap, hck_delim);
    run_wrapper(&src, &dst, &opts);
    let expected = vec![vec!["f", "a", "b", "c", "d"], vec!["6", "1", "2", "3", "4"]];
    assert_eq!(read_tsv(dst), expected);
}
/// Regex delimiter `\s+`: adjacent single indices keep their listed order
/// (`1,3,2,7,6`).
#[rstest]
fn test_reorder_merged_range(
    #[values(true, false)] no_mmap: bool,
    #[values(r"\s+")] hck_delim: &str,
) {
    let workdir = TempDir::new().unwrap();
    let (src, dst) = (
        workdir.path().join("input.txt"),
        workdir.path().join("output.txt"),
    );
    let rows = vec![
        vec!["a", "b", "c", "d", "e", "f", "g"],
        vec!["1", "2", "3", "4", "5", "6", "7"],
    ];
    write_file(&src, rows, FOURSPACE);
    let opts = build_opts(&src, &dst, "1,3,2,7,6", no_mmap, hck_delim);
    run_wrapper(&src, &dst, &opts);
    let expected = vec![vec!["a", "c", "b", "g", "f"], vec!["1", "3", "2", "7", "6"]];
    assert_eq!(read_tsv(dst), expected);
}
/// Regex delimiter `\s+`: `3-,1,4-5` emits columns 3-7 then 1; the overlapping `4-5`
/// adds nothing new.
#[rstest]
fn test_reorder2(#[values(true, false)] no_mmap: bool, #[values(r"\s+")] hck_delim: &str) {
    let workdir = TempDir::new().unwrap();
    let (src, dst) = (
        workdir.path().join("input.txt"),
        workdir.path().join("output.txt"),
    );
    let rows = vec![
        vec!["a", "b", "c", "d", "e", "f", "g"],
        vec!["1", "2", "3", "4", "5", "6", "7"],
    ];
    write_file(&src, rows, FOURSPACE);
    let opts = build_opts(&src, &dst, "3-,1,4-5", no_mmap, hck_delim);
    run_wrapper(&src, &dst, &opts);
    let expected = vec![
        vec!["c", "d", "e", "f", "g", "a"],
        vec!["3", "4", "5", "6", "7", "1"],
    ];
    assert_eq!(read_tsv(dst), expected);
}
/// Literal delimiter: selecting field `1` returns the first column.
#[rstest]
#[rustfmt::skip::macros(vec)]
fn test_read_single_values_not_regex(
    #[values(true, false)] no_mmap: bool,
    #[values(" ", " ")] hck_delim: &str,
) {
    let workdir = TempDir::new().unwrap();
    let (src, dst) = (
        workdir.path().join("input.txt"),
        workdir.path().join("output.txt"),
    );
    let rows = vec![
        vec!["a", "b", "c"],
        vec!["1", "2", "3"],
    ];
    write_file(&src, rows, hck_delim);
    let opts = build_opts_not_regex(&src, &dst, "1", no_mmap, hck_delim);
    run_wrapper(&src, &dst, &opts);
    let expected = vec![vec!["a"], vec!["1"]];
    assert_eq!(read_tsv(dst), expected);
}
/// Literal delimiter: selecting `1,3` returns the first and third columns.
#[rstest]
fn test_read_several_single_values_not_regex(
    #[values(true, false)] no_mmap: bool,
    #[values(" ", " ")] hck_delim: &str,
) {
    let workdir = TempDir::new().unwrap();
    let (src, dst) = (
        workdir.path().join("input.txt"),
        workdir.path().join("output.txt"),
    );
    let rows = vec![vec!["a", "b", "c"], vec!["1", "2", "3"]];
    write_file(&src, rows, hck_delim);
    let opts = build_opts_not_regex(&src, &dst, "1,3", no_mmap, hck_delim);
    run_wrapper(&src, &dst, &opts);
    let expected = vec![vec!["a", "c"], vec!["1", "3"]];
    assert_eq!(read_tsv(dst), expected);
}
/// Literal delimiter: bytes that are not valid UTF-8 pass through a selected field
/// unchanged.
#[rstest]
fn test_read_several_single_values_with_invalid_utf8_not_regex(
    #[values(true, false)] no_mmap: bool,
    #[values(" ", " ")] hck_delim: &str,
) {
    let workdir = TempDir::new().unwrap();
    let (src, dst) = (
        workdir.path().join("input.txt"),
        workdir.path().join("output.txt"),
    );
    // NOTE(review): deliberately builds a `String` from invalid UTF-8 to get the raw
    // bytes into the fixture; this violates `String::from_utf8_unchecked`'s contract.
    let bad_str = unsafe { String::from_utf8_unchecked(b"a\xED\xA0\x80z".to_vec()) };
    let rows = vec![vec![bad_str.as_str(), "b", "c"], vec!["1", "2", "3"]];
    write_file(&src, rows, hck_delim);
    let opts = build_opts_not_regex(&src, &dst, "1,3", no_mmap, hck_delim);
    run_wrapper(&src, &dst, &opts);
    let expected = vec![vec![bad_str.as_str(), "c"], vec!["1", "3"]];
    assert_eq!(read_tsv(dst), expected);
}
/// Literal delimiter: the open range `2-` returns every column from the second on.
#[rstest]
fn test_read_single_range_not_regex(
    #[values(true, false)] no_mmap: bool,
    #[values(" ", " ")] hck_delim: &str,
) {
    let workdir = TempDir::new().unwrap();
    let (src, dst) = (
        workdir.path().join("input.txt"),
        workdir.path().join("output.txt"),
    );
    let rows = vec![vec!["a", "b", "c", "d"], vec!["1", "2", "3", "4"]];
    write_file(&src, rows, hck_delim);
    let opts = build_opts_not_regex(&src, &dst, "2-", no_mmap, hck_delim);
    run_wrapper(&src, &dst, &opts);
    let expected = vec![vec!["b", "c", "d"], vec!["2", "3", "4"]];
    assert_eq!(read_tsv(dst), expected);
}
/// Literal delimiter: two ranges (`2-4,6-`) return columns 2-4 followed by 6-7.
#[rstest]
fn test_read_serveral_range_not_regex(
    #[values(true, false)] no_mmap: bool,
    #[values(" ", " ")] hck_delim: &str,
) {
    let workdir = TempDir::new().unwrap();
    let (src, dst) = (
        workdir.path().join("input.txt"),
        workdir.path().join("output.txt"),
    );
    let rows = vec![
        vec!["a", "b", "c", "d", "e", "f", "g"],
        vec!["1", "2", "3", "4", "5", "6", "7"],
    ];
    write_file(&src, rows, hck_delim);
    let opts = build_opts_not_regex(&src, &dst, "2-4,6-", no_mmap, hck_delim);
    run_wrapper(&src, &dst, &opts);
    let expected = vec![vec!["b", "c", "d", "f", "g"], vec!["2", "3", "4", "6", "7"]];
    assert_eq!(read_tsv(dst), expected);
}
/// Literal delimiter: a single index plus an open range (`2,4-`).
#[rstest]
fn test_read_mixed_fields1_not_regex(
    #[values(true, false)] no_mmap: bool,
    #[values(" ", " ")] hck_delim: &str,
) {
    let workdir = TempDir::new().unwrap();
    let (src, dst) = (
        workdir.path().join("input.txt"),
        workdir.path().join("output.txt"),
    );
    let rows = vec![
        vec!["a", "b", "c", "d", "e", "f", "g"],
        vec!["1", "2", "3", "4", "5", "6", "7"],
    ];
    write_file(&src, rows, hck_delim);
    let opts = build_opts_not_regex(&src, &dst, "2,4-", no_mmap, hck_delim);
    run_wrapper(&src, &dst, &opts);
    let expected = vec![vec!["b", "d", "e", "f", "g"], vec!["2", "4", "5", "6", "7"]];
    assert_eq!(read_tsv(dst), expected);
}
/// Literal delimiter: a leading open range plus a single index (`-4,7`).
#[rstest]
fn test_read_mixed_fields2_not_regex(
    #[values(true, false)] no_mmap: bool,
    #[values(" ", " ")] hck_delim: &str,
) {
    let workdir = TempDir::new().unwrap();
    let (src, dst) = (
        workdir.path().join("input.txt"),
        workdir.path().join("output.txt"),
    );
    let rows = vec![
        vec!["a", "b", "c", "d", "e", "f", "g"],
        vec!["1", "2", "3", "4", "5", "6", "7"],
    ];
    write_file(&src, rows, hck_delim);
    let opts = build_opts_not_regex(&src, &dst, "-4,7", no_mmap, hck_delim);
    run_wrapper(&src, &dst, &opts);
    let expected = vec![vec!["a", "b", "c", "d", "g"], vec!["1", "2", "3", "4", "7"]];
    assert_eq!(read_tsv(dst), expected);
}
/// Literal delimiter: when the delimiter never occurs (cells joined with `-`), each
/// whole line is emitted as the single field.
#[rstest]
fn test_read_no_delimis_found_not_regex(
    #[values(true, false)] no_mmap: bool,
    #[values(" ", " ")] hck_delim: &str,
) {
    let workdir = TempDir::new().unwrap();
    let (src, dst) = (
        workdir.path().join("input.txt"),
        workdir.path().join("output.txt"),
    );
    let rows = vec![
        vec!["a", "b", "c", "d", "e", "f", "g"],
        vec!["1", "2", "3", "4", "5", "6", "7"],
    ];
    write_file(&src, rows, "-");
    let opts = build_opts_not_regex(&src, &dst, "-4,7", no_mmap, hck_delim);
    run_wrapper(&src, &dst, &opts);
    let expected = vec![vec!["a-b-c-d-e-f-g"], vec!["1-2-3-4-5-6-7"]];
    assert_eq!(read_tsv(dst), expected);
}
/// Literal delimiter: selections past the last column (`8`, `11-`) are ignored.
#[rstest]
fn test_read_over_end_not_regex(
    #[values(true, false)] no_mmap: bool,
    #[values(" ", " ")] hck_delim: &str,
) {
    let workdir = TempDir::new().unwrap();
    let (src, dst) = (
        workdir.path().join("input.txt"),
        workdir.path().join("output.txt"),
    );
    let rows = vec![
        vec!["a", "b", "c", "d", "e", "f", "g"],
        vec!["1", "2", "3", "4", "5", "6", "7"],
    ];
    write_file(&src, rows, hck_delim);
    let opts = build_opts_not_regex(&src, &dst, "-4,8,11-", no_mmap, hck_delim);
    run_wrapper(&src, &dst, &opts);
    let expected = vec![vec!["a", "b", "c", "d"], vec!["1", "2", "3", "4"]];
    assert_eq!(read_tsv(dst), expected);
}
/// Literal delimiter: output follows the selection order (`6,-4` gives 6, 1-4).
#[rstest]
fn test_reorder1_not_regex(
    #[values(true, false)] no_mmap: bool,
    #[values(" ", " ")] hck_delim: &str,
) {
    let workdir = TempDir::new().unwrap();
    let (src, dst) = (
        workdir.path().join("input.txt"),
        workdir.path().join("output.txt"),
    );
    let rows = vec![
        vec!["a", "b", "c", "d", "e", "f", "g"],
        vec!["1", "2", "3", "4", "5", "6", "7"],
    ];
    write_file(&src, rows, hck_delim);
    let opts = build_opts_not_regex(&src, &dst, "6,-4", no_mmap, hck_delim);
    run_wrapper(&src, &dst, &opts);
    let expected = vec![vec!["f", "a", "b", "c", "d"], vec!["6", "1", "2", "3", "4"]];
    assert_eq!(read_tsv(dst), expected);
}
/// Literal delimiter: `3-,1,4-5` emits columns 3-7 then 1; the overlapping `4-5` adds
/// nothing new.
#[rstest]
fn test_reorder2_not_regex(
    #[values(true, false)] no_mmap: bool,
    #[values(" ", " ")] hck_delim: &str,
) {
    let workdir = TempDir::new().unwrap();
    let (src, dst) = (
        workdir.path().join("input.txt"),
        workdir.path().join("output.txt"),
    );
    let rows = vec![
        vec!["a", "b", "c", "d", "e", "f", "g"],
        vec!["1", "2", "3", "4", "5", "6", "7"],
    ];
    write_file(&src, rows, hck_delim);
    let opts = build_opts_not_regex(&src, &dst, "3-,1,4-5", no_mmap, hck_delim);
    run_wrapper(&src, &dst, &opts);
    let expected = vec![
        vec!["c", "d", "e", "f", "g", "a"],
        vec!["3", "4", "5", "6", "7", "1"],
    ];
    assert_eq!(read_tsv(dst), expected);
}
/// Literal delimiter with a reordering selection: when the delimiter never occurs,
/// each whole line is still emitted exactly once.
#[rstest]
fn test_reorder_no_split_found(
    #[values(true, false)] no_mmap: bool,
    #[values(" ", " ")] hck_delim: &str,
) {
    let workdir = TempDir::new().unwrap();
    let (src, dst) = (
        workdir.path().join("input.txt"),
        workdir.path().join("output.txt"),
    );
    let rows = vec![
        vec!["a", "b", "c", "d", "e", "f", "g"],
        vec!["1", "2", "3", "4", "5", "6", "7"],
    ];
    write_file(&src, rows, "-");
    let opts = build_opts_not_regex(&src, &dst, "3-,1,4-5", no_mmap, hck_delim);
    run_wrapper(&src, &dst, &opts);
    let expected = vec![vec!["a-b-c-d-e-f-g"], vec!["1-2-3-4-5-6-7"]];
    assert_eq!(read_tsv(dst), expected);
}
/// Regex delimiter with a reordering selection: when the delimiter never matches
/// (cells joined with `---`), each whole line is still emitted exactly once.
#[rstest]
fn test_reorder_no_split_found_regex(
    #[values(true, false)] no_mmap: bool,
    #[values(" ", " ")] hck_delim: &str,
) {
    let workdir = TempDir::new().unwrap();
    let (src, dst) = (
        workdir.path().join("input.txt"),
        workdir.path().join("output.txt"),
    );
    let rows = vec![
        vec!["a", "b", "c", "d", "e", "f", "g"],
        vec!["1", "2", "3", "4", "5", "6", "7"],
    ];
    write_file(&src, rows, "---");
    let opts = build_opts(&src, &dst, "3-,1,4-5", no_mmap, hck_delim);
    run_wrapper(&src, &dst, &opts);
    let expected = vec![
        vec!["a---b---c---d---e---f---g"],
        vec!["1---2---3---4---5---6---7"],
    ];
    assert_eq!(read_tsv(dst), expected);
}
/// Regression test for issue 12 with the `\s+` regex delimiter: the selection
/// `2,3,4-` must return columns 2 through 7.
#[rstest]
fn test_issue_12_with_regex(
    #[values(true, false)] no_mmap: bool,
    #[values(r"\s+")] hck_delim: &str,
) {
    let workdir = TempDir::new().unwrap();
    let (src, dst) = (
        workdir.path().join("input.txt"),
        workdir.path().join("output.txt"),
    );
    let rows = vec![
        vec!["a", "b", "c", "d", "e", "f", "g"],
        vec!["1", "2", "3", "4", "5", "6", "7"],
    ];
    write_file(&src, rows, " ");
    let opts = build_opts(&src, &dst, "2,3,4-", no_mmap, hck_delim);
    run_wrapper(&src, &dst, &opts);
    let expected = vec![
        vec!["b", "c", "d", "e", "f", "g"],
        vec!["2", "3", "4", "5", "6", "7"],
    ];
    assert_eq!(read_tsv(dst), expected);
}
#[rstest]
fn test_issue_12_no_regex(
#[values(true, false)] no_mmap: bool,
#[values(" ", " ")] hck_delim: &str,
) {
let tmp = TempDir::new().unwrap();
let input_file = tmp.path().join("input.txt");
let output_file = tmp.path().join("output.txt");
let opts = build_opts(&input_file, &output_file, "2,3,4-", no_mmap, hck_delim);
let data = vec![
vec!["a", "b", "c", "d", "e", "f", "g"],
vec!["1", "2", "3", "4", "5", "6", "7"],
];
write_file(&input_file, data, hck_delim);
run_wrapper(&input_file, &output_file, &opts);
let filtered = read_tsv(output_file);
assert_eq!(
filtered,
vec![
vec!["b", "c", "d", "e", "f", "g"],
vec!["2", "3", "4", "5", "6", "7"]
]
);
}
/// Regression test for issue 38 with a literal delimiter: empty input lines are
/// preserved as empty output lines around the selected columns `1,2`.
#[rstest]
fn test_issue_38_not_regex(
    #[values(true, false)] no_mmap: bool,
    #[values(" ", " ")] hck_delim: &str,
) {
    let workdir = TempDir::new().unwrap();
    let (src, dst) = (
        workdir.path().join("input.txt"),
        workdir.path().join("output.txt"),
    );
    let rows = vec![
        vec![""],
        vec![""],
        vec!["a", "b", "c", "d", "e", "f", "g"],
        vec![""],
        vec![""],
        vec!["1", "2", "3", "4", "5", "6", "7"],
    ];
    write_file(&src, rows, hck_delim);
    let opts = build_opts_not_regex(&src, &dst, "1,2", no_mmap, hck_delim);
    run_wrapper(&src, &dst, &opts);
    let expected = vec![
        vec![""],
        vec![""],
        vec!["a", "b"],
        vec![""],
        vec![""],
        vec!["1", "2"],
    ];
    assert_eq!(read_tsv(dst), expected);
}
}