use std::collections::HashSet;
use aho_corasick::AhoCorasick;
use bstr::ByteSlice;
use csv;
use regex::bytes::{RegexBuilder, RegexSetBuilder};
use crate::config::{Config, Delimiter};
use crate::select::SelectColumns;
use crate::util;
use crate::CliError;
use crate::CliResult;
enum Matcher {
NonEmpty,
Substring(AhoCorasick, bool),
Exact(Vec<u8>, bool),
Regex(regex::bytes::Regex),
ManyRegex(regex::bytes::RegexSet),
ManyExact(HashSet<Vec<u8>>, bool),
}
impl Matcher {
fn is_match(&self, cell: &[u8]) -> bool {
match self {
Self::NonEmpty => !cell.is_empty(),
Self::Substring(pattern, case_insensitive) => {
if *case_insensitive {
pattern.is_match(&cell.to_lowercase())
} else {
pattern.is_match(cell)
}
}
Self::Regex(pattern) => pattern.is_match(cell),
Self::Exact(pattern, case_insensitive) => {
if *case_insensitive {
&cell.to_lowercase() == pattern
} else {
cell == pattern
}
}
Self::ManyRegex(set) => set.is_match(cell),
Self::ManyExact(patterns, case_insensitive) => {
if *case_insensitive {
patterns.contains(&cell.to_lowercase())
} else {
patterns.contains(cell)
}
}
}
}
}
static USAGE: &str = "
Filter rows of given CSV file if some of its cells contains a desired substring.
Can also be used to search for exact matches using the -e, --exact flag.
Can also be used to search using a regular expression using the -r, --regex flag.
When using a regular expression, be sure to mind bash escape rules (prefer single
quotes around your expression and don't forget to use backslashes when needed):
$ xan search -r '\\bfran[cç]' file.csv
To restrict the columns that will be searched you can use the -s, --select flag.
All search modes can also be case-insensitive using -i, --ignore-case.
Finally, this command is also able to take a CSV file column containing multiple
patterns to search for at once, using the --input flag:
$ xan search --input user-ids.csv user_id tweets.csv
Usage:
xan search [options] --non-empty [<input>]
xan search [options] --input <index> <column> [<input>]
xan search [options] <pattern> [<input>]
xan search --help
search options:
-e, --exact Perform an exact match.
-r, --regex Use a regex to perform the match.
-N, --non-empty Search for non-empty cells, i.e. filter out
any completely empty selection.
--input <index> CSV file containing a column of value to index & search.
-i, --ignore-case Case insensitive search. This is equivalent to
prefixing the regex with '(?i)'.
-s, --select <arg> Select the columns to search. See 'xan select -h'
for the full syntax.
-v, --invert-match Select only rows that did not match
-f, --flag <column> If given, the command will not filter rows
but will instead flag the found rows in a new
column with given name.
Common options:
-h, --help Display this message
-o, --output <file> Write output to <file> instead of stdout.
-n, --no-headers When set, the first row will not be interpreted
as headers. (i.e., They are not searched, analyzed,
sliced, etc.)
-d, --delimiter <arg> The field delimiter for reading CSV data.
Must be a single character.
";
#[derive(Deserialize)]
struct Args {
arg_input: Option<String>,
arg_pattern: Option<String>,
arg_column: Option<SelectColumns>,
flag_select: SelectColumns,
flag_output: Option<String>,
flag_no_headers: bool,
flag_delimiter: Option<Delimiter>,
flag_invert_match: bool,
flag_ignore_case: bool,
flag_non_empty: bool,
flag_exact: bool,
flag_regex: bool,
flag_flag: Option<String>,
flag_input: Option<String>,
}
impl Args {
fn get_matcher(&self) -> Result<Matcher, CliError> {
if self.flag_non_empty {
return Ok(Matcher::NonEmpty);
}
match self.arg_column.as_ref() {
None => {
let pattern = self.arg_pattern.as_ref().unwrap();
Ok(if self.flag_exact {
if self.flag_ignore_case {
Matcher::Exact(pattern.as_bytes().to_lowercase(), true)
} else {
Matcher::Exact(pattern.as_bytes().to_vec(), false)
}
} else if self.flag_regex {
Matcher::Regex(
RegexBuilder::new(pattern)
.case_insensitive(self.flag_ignore_case)
.build()?,
)
} else {
Matcher::Substring(
AhoCorasick::new([if self.flag_ignore_case {
pattern.to_lowercase()
} else {
pattern.to_string()
}])?,
self.flag_ignore_case,
)
})
}
Some(column) => {
let rconf = Config::new(&self.flag_input)
.delimiter(self.flag_delimiter)
.select(column.clone());
let mut rdr = rconf.reader()?;
let headers = rdr.byte_headers()?;
let column_index = rconf.single_selection(headers)?;
let mut record = csv::ByteRecord::new();
let mut set: HashSet<Vec<u8>> = HashSet::new();
let mut patterns: Vec<String> = Vec::new();
while rdr.read_byte_record(&mut record)? {
let pattern = &record[column_index];
if self.flag_exact {
if self.flag_ignore_case {
set.insert(pattern.to_lowercase());
} else {
set.insert(pattern.to_vec());
}
} else {
patterns.push(std::str::from_utf8(pattern).unwrap().to_string());
}
}
Ok(if self.flag_exact {
Matcher::ManyExact(set, self.flag_ignore_case)
} else if self.flag_regex {
Matcher::ManyRegex(
RegexSetBuilder::new(&patterns)
.case_insensitive(self.flag_ignore_case)
.build()?,
)
} else {
Matcher::Substring(AhoCorasick::new(&patterns)?, self.flag_ignore_case)
})
}
}
}
}
pub fn run(argv: &[&str]) -> CliResult<()> {
let args: Args = util::get_args(USAGE, argv)?;
let mut matchers_count: u8 = 0;
if args.flag_exact {
matchers_count += 1;
}
if args.flag_regex {
matchers_count += 1;
}
if args.flag_non_empty {
matchers_count += 1;
}
if matchers_count > 1 {
Err("must select only one of -e/--exact, -N,--non-empty, -r,--regex!")?;
}
let matcher = args.get_matcher()?;
let rconfig = Config::new(&args.arg_input)
.delimiter(args.flag_delimiter)
.no_headers(args.flag_no_headers)
.select(args.flag_select);
let mut rdr = rconfig.reader()?;
let mut wtr = Config::new(&args.flag_output).writer()?;
let mut headers = rdr.byte_headers()?.clone();
let sel = rconfig.selection(&headers)?;
if let Some(column_name) = args.flag_flag.clone() {
headers.push_field(column_name.as_bytes());
}
if !rconfig.no_headers {
wtr.write_record(&headers)?;
}
let mut record = csv::ByteRecord::new();
while rdr.read_byte_record(&mut record)? {
let mut is_match = sel.select(&record).any(|cell| matcher.is_match(cell));
if args.flag_invert_match {
is_match = !is_match;
}
if args.flag_flag.is_some() {
record.push_field(if is_match { b"1" } else { b"0" });
wtr.write_byte_record(&record)?;
} else if is_match {
wtr.write_byte_record(&record)?;
}
}
Ok(wtr.flush()?)
}