#![doc = include_str!("../docs/parse.md")]
use clap::ArgAction;
use clap::{Arg, Command};
use indicatif::ProgressBar;
use polars::prelude::*;
use rand::rngs::StdRng;
use rand::seq::SliceRandom as _;
use rand::SeedableRng;
use anyhow::{anyhow, bail, ensure, Context, Error, Result};
use std::iter::FromIterator as _;
use std::vec;
use std::{collections::HashSet, fmt::Write, io::Write as IOWrite, sync::Mutex};
use tracing::info;
use tree_sitter::{Language, Node, Parser, Tree};
use crate::utils::fs::*;
use crate::utils::regex::*;
use crate::utils::{
csv::*,
logger::{log_output_file, log_seed, Logger},
};
/// Build the clap definition of the `parse` subcommand.
///
/// The command parses every file listed in the input CSV and extracts the
/// functions whose body contains at least one of the configured keywords.
pub fn cli() -> Command {
    Command::new("parse")
        .about("Parse all the files in the dataset and extract functions whose body contains one of the provided keywords.")
        .long_about(include_str!("../docs/parse.md"))
        .disable_version_flag(true)
        .arg(
            Arg::new("input")
                .short('i')
                .long("input")
                .value_name("INPUT_FILE.csv")
                .help("Path to the input csv file to use. It must be a valid CSV file where the first column is the path to the file and the \
                       second column is the extension of the file. Other columns are ignored.")
                .required(true)
        )
        .arg(
            Arg::new("output")
                .short('o')
                .long("output")
                .value_name("OUTPUT_FILE.csv")
                .help("Path to the output csv file storing the functions statistics.")
                .required(false),
        )
        .arg(
            Arg::new("logs")
                .short('l')
                .long("logs")
                .value_name("LOGS_FOLDER")
                .help("Path to the folder where the logs are stored. The default is the current folder.")
                .required(false),
        )
        .arg(
            Arg::new("keywords")
                .short('k')
                .long("keywords")
                .num_args(1..)
                .action(ArgAction::Append)
                .value_name("KEYWORDS_FILES.json")
                .help("List of files containing the list of extensions and keywords to use. The files must be in JSON format.\n\
                       The extensions should be written without the period (`java` instead of `.java`). The files must have the following structure:\n \
                       {\n\
                       \"languages\": [\n\
                       {\n\
                       \"name\": \"LanguageName\",\n\
                       \"extensions\": [\".ext1\", \".ext2\", ...],\n\
                       \"keywords\": [\"localKeyword1\", \"localKeyword2\", ...] // optional\n\
                       },\n\
                       ...\n\
                       ],\n\
                       \"keywords\": [\"globalKeyword1\", \"globalKeyword2\", ...] // optional\n\
                       }")
                .required(true)
        )
        .arg(
            Arg::new("lang")
                .long("lang")
                .num_args(1..)
                .action(ArgAction::Append)
                .value_name("LANGUAGES")
                // Fixed: Scala is supported by `run` but was missing from this list.
                .help("List of languages to parse. The supported languages are C, C++, C#, Fortran, Go, Java, Python, Scala and Typescript.")
                .required(false)
        )
        .arg(
            Arg::new("force")
                .short('f')
                .long("force")
                .help("Override the output file if it already exists.")
                .default_value("false")
                .action(ArgAction::SetTrue),
        )
        .arg(
            Arg::new("threads")
                .short('n')
                .help("Number of threads to use.")
                .default_value("1")
                .value_parser(clap::value_parser!(usize))
        )
        .arg(
            Arg::new("seed")
                .short('s')
                .long("seed")
                .value_name("SEED")
                .help("Seed used to randomly shuffle the input file.")
                .default_value("8155495201244430235")
                .value_parser(clap::value_parser!(u64)),
        )
        .arg(
            Arg::new("failures")
                .long("failures")
                .value_name("POLICY")
                .help("Failure policy when a file or a function has a parsing error.\n\
                       ignore: continue parsing\n\
                       skip-file: replace the file statistics with an error row in the output file, does not extract any function from the file\n\
                       skip-function: replace the function statistics with an error row in the output file\n\
                       abort: stop the program")
                .default_value("ignore")
                .value_parser(["ignore", "skip-file", "skip-function", "abort"]),
        )
}
/// Run the `parse` phase.
///
/// Reads the input CSV (columns `id`, `name`, `language`), keeps only the
/// rows whose language is selected, shuffles them deterministically with
/// `seed`, then parses every file on `threads` worker threads. Per-function
/// statistics are written to `output_path` (default `<input>.functions.csv`)
/// and per-file logs to `logs_path` (default `<input>.function_logs.csv`).
///
/// # Errors
/// Fails on unsupported languages, I/O or CSV errors, or when `fail_policy`
/// is `abort` and a parse error is encountered.
pub fn run(
    input_path: &str,
    output_path: Option<&str>,
    logs_path: Option<&str>,
    keywords_file_paths: &[&str],
    opt_languages: Option<Vec<&str>>,
    fail_policy: &str,
    threads: usize,
    seed: u64,
    force: bool,
    logger: &Logger,
) -> Result<()> {
    let supported_languages: HashSet<&'static str> = HashSet::from([
        "c",
        "c++",
        "c#",
        "java",
        "python",
        "fortran",
        "typescript",
        "go",
        "scala",
    ]);
    // Validate the requested languages, or fall back to all supported ones.
    let languages: Vec<&str> = match opt_languages {
        Some(l) => {
            for lang in l.iter() {
                ensure!(
                    supported_languages.contains(lang),
                    "Unsupported language: {lang}"
                );
            }
            l
        }
        None => {
            info!("No language specified, using all supported languages");
            supported_languages.into_iter().collect()
        }
    };
    let languages_series = Series::new(
        "language_filter".into(),
        languages
            .iter()
            .map(|x| x.to_string())
            .collect::<Vec<String>>(),
    );
    let default_output_path: String = format!("{input_path}.functions.csv");
    let output_path: &str = output_path.unwrap_or(&default_output_path);
    log_output_file(output_path, false, force)?;
    let default_logs_path: String = format!("{input_path}.function_logs.csv");
    let logs_path: &str = logs_path.unwrap_or(&default_logs_path);
    log_output_file(logs_path, false, force)?;
    let mut input_file = open_csv(
        input_path,
        Some(Schema::from_iter(vec![
            Field::new("id".into(), DataType::UInt32),
            Field::new("name".into(), DataType::String),
            Field::new("language".into(), DataType::String),
        ])),
        Some(vec!["id", "name", "language"]),
    )?;
    let n_files_before = input_file.height();
    info!(
        " {} files found in the input file, filtering by selected languages",
        n_files_before
    );
    input_file = input_file
        .lazy()
        .filter(col("language").is_in(lit(languages_series)))
        .collect()?;
    let n_files = input_file.height();
    info!(
        " {} files found after filtering ({:.2} %)",
        n_files,
        // Compute the ratio in floating point: the previous integer division
        // (`n_files / n_files_before * 100`) truncated any partial
        // percentage down to 0.
        if n_files_before == 0 {
            0.0
        } else {
            n_files as f64 / n_files_before as f64 * 100.0
        }
    );
    log_seed(seed);
    // Shuffle the row order deterministically so the workload is spread
    // evenly across threads regardless of the input file ordering.
    let mut shuffled_idx = (0..input_file.height()).collect::<Vec<usize>>();
    logger.run_task("Loading files in random order", || {
        let mut rng: StdRng = SeedableRng::seed_from_u64(seed);
        shuffled_idx.shuffle(&mut rng);
        Ok(())
    })?;
    // Decode each row into (id, path, language); `Err(idx)` marks rows whose
    // columns do not have the expected types.
    let shuffled_rows = shuffled_idx.into_iter().map(|idx| {
        let row = input_file.get_row(idx).unwrap().0;
        match (row[0].clone(), row[1].clone(), row[2].clone()) {
            (AnyValue::UInt32(id), AnyValue::String(path), AnyValue::String(lang)) => Ok((
                id,
                // Undo the CSV escaping applied when the input file was written.
                path.replace("-was_comma-", ",")
                    .replace("-was_quote-", "\""),
                lang,
            )),
            _ => Err(idx),
        }
    });
    const OUTPUT_COLS: usize = 17;
    const LOGS_COLS: usize = 7;
    let keyword_files: KeywordFiles = logger.run_task("Loading keywords", || {
        KeywordFiles::new().add_files(keywords_file_paths, true)
    })?;
    // One CSV column per keyword file, named after its path.
    let keyword_match_headers: String = keyword_files.paths.join(",");
    let word_counter: Matcher = Matcher::words_matcher();
    let mut output_file = CSVFile::new(output_path, FileMode::Overwrite)?;
    let header: [&str; OUTPUT_COLS] = [
        "id",
        "path",
        "name",
        "position",
        "language",
        "loc",
        "words",
        &keyword_match_headers,
        "loop_statements",
        "loop_nestings",
        "if_statements",
        "if_nestings",
        "functions_calls",
        "function_calls_nestings",
        "params",
        "param_kw_match",
        "parse_error",
    ];
    output_file.write_header(&header)?;
    let mut logs_file = CSVFile::new(logs_path, FileMode::Overwrite)?;
    let logs_header: [&str; LOGS_COLS] = [
        "id",
        "name",
        "language",
        "functions",
        "functions_with_kw",
        &keyword_match_headers,
        "parse_error",
    ];
    logs_file.write_header(&logs_header)?;
    // Workers pull rows from this shared iterator and send results back over
    // the channel; `None` signals that a worker has finished.
    let iter = Mutex::new(shuffled_rows);
    let (tx, rx) =
        crossbeam_channel::unbounded::<Option<Result<(String, Option<String>), Error>>>();
    crossbeam::thread::scope(|s| {
        for _ in 0..threads {
            s.spawn(|_| {
                let my_tx = tx.clone();
                loop {
                    // Hold the lock only long enough to take the next row.
                    let next_item: Option<Result<(u32, String, &str), usize>> = {
                        let mut iter_guard = iter.lock().unwrap();
                        iter_guard.next()
                    };
                    match next_item {
                        Some(row) => match row {
                            Ok((project_id, file_name, language)) => match analyze_file(
                                project_id,
                                &file_name,
                                language,
                                &keyword_files,
                                fail_policy,
                                &word_counter,
                            ) {
                                Ok(s) => {
                                    my_tx.send(Some(Ok(s))).unwrap();
                                }
                                Err(e) => {
                                    // Forward the error and stop this worker.
                                    my_tx.send(Some(Err(e))).unwrap();
                                    break;
                                }
                            },
                            Err(row_nr) => {
                                let _ =
                                    my_tx.send(Some(Err(anyhow!("Could not parse row {row_nr}"))));
                            }
                        },
                        None => {
                            // Input exhausted: report completion and stop.
                            my_tx.send(None).unwrap();
                            break;
                        }
                    }
                }
            });
        }
        let mut ended_threads = 0;
        let progress = ProgressBar::new(n_files as u64);
        progress.set_style(
            indicatif::ProgressStyle::default_bar().template("{elapsed} {wide_bar} {percent}%")?,
        );
        // Collect results until every worker has reported completion.
        while let Ok(msg) = rx.recv() {
            match msg {
                Some(msg_content) => {
                    let (output, opt_log) = msg_content?;
                    write!(&mut output_file, "{output}")?;
                    if let Some(log) = opt_log {
                        writeln!(&mut logs_file, "{log}")?;
                    }
                    progress.inc(1);
                }
                None => {
                    ended_threads += 1;
                    if ended_threads == threads {
                        break;
                    }
                }
            }
        }
        progress.finish();
        Ok(())
    })
    .map_err(|e| anyhow!("Error in thread pool: {e:?}"))?
}
/// Parse a single file and compute its per-function statistics.
///
/// Returns `(function_rows, file_log_row)`:
/// * `function_rows` — CSV lines for the output file, one per function that
///   matched at least one keyword (empty when the file is skipped or
///   unreadable);
/// * `file_log_row` — one CSV line for the logs file, or `None` only when
///   the whole file is skipped under the `skip-file` failure policy.
fn analyze_file(
    project_id: u32,
    path: &str,
    language: &str,
    keywords_files: &KeywordFiles,
    fail_policy: &str,
    word_counter: &Matcher,
) -> Result<(String, Option<String>)> {
    let grammar = language_to_grammar(language)
        .with_context(|| format!("Unsupported language: {language}"))?;
    let mut parser: Parser = Parser::new();
    parser.set_language(&grammar.lang)?;
    // NOTE(review): the second argument looks like a 1 GiB size cap for
    // loading the file — confirm against `utils::fs::load_file`.
    match load_file(path, 1024 * 1024 * 1024)? {
        Ok(source_code) => {
            // Extracted function bodies are written next to the source file.
            let target_folder: String = format!("{path}.functions");
            create_dir(&target_folder)?;
            let tree: Tree = parser
                .parse(&source_code, None)
                .with_context(|| format!("Failed to parse file {path}"))?;
            let file_has_parse_error: bool = tree.root_node().has_error();
            if file_has_parse_error && fail_policy == "skip-file" {
                // Skip the file entirely: no output rows, no log row.
                Ok((String::new(), None))
            } else if file_has_parse_error && fail_policy == "abort" {
                bail!("Parse error in file {path}")
            } else {
                let root: Node<'_> = tree.root_node();
                let (output, total_functions, functions_with_kw, functions_with_specific_kw) =
                    extract_functions(
                        project_id,
                        &root,
                        &target_folder,
                        language,
                        &grammar,
                        &source_code,
                        keywords_files,
                        fail_policy,
                        word_counter,
                        &mut parser,
                    )?;
                // Position of the first parse error, or "none" for clean files.
                let error_position: String = if file_has_parse_error {
                    position_to_string(find_first_error_position(&root))
                } else {
                    "none".to_string()
                };
                Ok((
                    output,
                    Some(format!(
                        "{},{},{},{},{},{},{}",
                        project_id,
                        // Escape characters that would break the CSV format.
                        path.replace(",", "-was_comma-")
                            .replace("\"", "-was_quote-"),
                        language,
                        total_functions,
                        functions_with_kw,
                        functions_with_specific_kw
                            .iter()
                            .map(|x| x.to_string())
                            .collect::<Vec<String>>()
                            .join(","),
                        error_position,
                    )),
                ))
            }
        }
        // The file could not be loaded: emit a sentinel error log row.
        Err(_) => Ok((
            String::new(),
            Some(file_error_row(
                project_id,
                path,
                language,
                keywords_files,
                "none",
            )),
        )),
    }
}
/// Build the logs-CSV row emitted when a file cannot be read: the function
/// counters and every per-keyword-file column are set to the sentinel `-1`.
fn file_error_row(
    project_id: u32,
    path: &str,
    language: &str,
    keyword_files: &KeywordFiles,
    parse_error: &str,
) -> String {
    // Escape CSV-breaking characters in the path the same way the writer does.
    let escaped_path = path
        .replace(",", "-was_comma-")
        .replace("\"", "-was_quote-");
    // One "-1" placeholder per keyword file, joined as CSV columns.
    let keyword_placeholders = vec!["-1"; keyword_files.paths.len()].join(",");
    format!("{project_id},{escaped_path},{language},-1,-1,{keyword_placeholders},{parse_error}")
}
/// Walk the parse tree and emit one output-CSV row for every function whose
/// body (comments and string literals stripped) matches at least one keyword.
/// Matching functions are also written verbatim to
/// `<target_folder>/<row>-<col>`.
///
/// Returns `(rows, total_functions, functions_with_kw, per_keyword_file_counts)`.
fn extract_functions(
    project_id: u32,
    root: &Node,
    target_folder: &str,
    language: &str,
    grammar: &Grammar,
    source: &[u8],
    keyword_files: &KeywordFiles,
    fail_policy: &str,
    word_counter: &Matcher,
    parser: &mut Parser,
) -> Result<(String, usize, usize, Vec<usize>), Error> {
    let mut builder: String = String::new();
    let mut functions: usize = 0;
    let mut functions_with_kw: usize = 0;
    // One counter per keyword file.
    let mut functions_with_specific_kw: Vec<usize> = vec![0; keyword_files.paths.len()];
    // Iterative depth-first traversal of the tree.
    let mut call_stack: Vec<Node> = Vec::new();
    call_stack.push(*root);
    let mut cursor = root.walk();
    while let Some(node) = call_stack.pop() {
        if grammar.function_nodes.contains(node.kind()) {
            let has_error: bool = node.has_error();
            if (has_error && fail_policy == "skip-function")
                // Java: skip declarations without a body (abstract/interface
                // methods); they are not counted as functions either.
                || (language == "java" && find_fields(&node, "body").is_empty())
            {
                continue;
            } else {
                let function_source_code: &[u8] = node_source_code(&node, source);
                // 1-based (row, col) of the function start in the file.
                let function_position: (usize, usize) = (
                    node.start_position().row + 1,
                    node.start_position().column + 1,
                );
                let error_position: String = if has_error {
                    // Report the error position relative to the function start.
                    position_to_string(find_first_error_position(&node).map(|(row, col)| {
                        let error_row = row - function_position.0 + 1;
                        if row == function_position.0 {
                            // Same line as the function start: adjust the column too.
                            (error_row, col - function_position.1 + 1)
                        } else {
                            (error_row, col)
                        }
                    }))
                } else {
                    "none".to_string()
                };
                // Strip comments, then re-parse the remaining text and strip
                // string literals, so keyword matching only sees code.
                let function_code_with_strings: &Vec<u8> =
                    &remove_kind_from_source(function_source_code, &node, &grammar.comment_nodes);
                let tree_without_comments: Tree = parser
                    .parse(function_code_with_strings, None)
                    .with_context(|| {
                        format!("Error parsing code for function {target_folder}/{functions}")
                    })?;
                let function_code = &remove_kind_from_source(
                    function_code_with_strings,
                    &tree_without_comments.root_node(),
                    &grammar.string_literal_nodes,
                );
                // Match counts, one entry per keyword file.
                let matches: Vec<usize> =
                    keyword_files.count_matches_in_text(language, function_code);
                if matches.iter().any(|x| *x > 0) {
                    // Save the original function source under <folder>/<row>-<col>.
                    let function_path: String = format!(
                        "{}/{}-{}",
                        target_folder, function_position.0, function_position.1
                    );
                    std::fs::write(&function_path, function_source_code)?;
                    let (loops, loop_nesting) = count_nodes_of_kind(&node, &grammar.loop_nodes);
                    let (conditionals, conditional_nesting) =
                        count_nodes_of_kind(&node, &grammar.cond_nodes);
                    let (calls, calls_nesting) =
                        count_nodes_of_kind(&node, &grammar.function_call_nodes);
                    let params_vec: Vec<Node<'_>> =
                        find_first_node_of_kind(&node, &grammar.param_seq_nodes, true);
                    // Function name: text of the grammar's name field, with any
                    // trailing parameter list and all whitespace removed.
                    let mut name: String = String::from_utf8_lossy(
                        find_first_field(&node, grammar.name_field)
                            .map(|n| node_source_code(&n, source))
                            .unwrap_or(b""),
                    )
                    .to_string();
                    if let Some(idx) = name.find('(') {
                        name.truncate(idx);
                    }
                    name = name.chars().filter(|c| !c.is_whitespace()).collect();
                    let mut n_param: usize = 0;
                    let mut param_match: usize = 0;
                    for params in params_vec {
                        // Count parameters whose declared type matches a keyword
                        // (only when the grammar exposes a parameter type field).
                        let matches = match grammar.param_type_field {
                            Some(field) => {
                                find_fields(&params, field)
                                    .into_iter()
                                    .map(|x| node_source_code(&x, source))
                                    .filter(|x| keyword_files.has_matches_in_text(language, x))
                                    .count()
                            }
                            None => 0,
                        };
                        n_param += count_nodes_of_kind(&params, &grammar.param_nodes).0;
                        param_match += matches;
                    }
                    // 17 columns: must stay in sync with OUTPUT_COLS in `run`.
                    writeln!(
                        &mut builder,
                        "{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{}",
                        project_id,
                        &function_path
                            .replace(",", "-was_comma-")
                            .replace("\"", "-was_quote-"),
                        name.replace(",", "-was_comma-")
                            .replace("\"", "-was_quote-"),
                        position_to_string(Some(function_position)),
                        language,
                        count_text_lines(function_code_with_strings),
                        word_counter.count_matches_in_text(function_code_with_strings),
                        matches
                            .iter()
                            .map(|x| x.to_string())
                            .collect::<Vec<String>>()
                            .join(","),
                        loops,
                        loop_nesting,
                        conditionals,
                        conditional_nesting,
                        calls,
                        calls_nesting,
                        n_param,
                        param_match,
                        error_position,
                    )?;
                    functions_with_kw += 1;
                    for (i, m) in matches.iter().enumerate() {
                        if *m > 0 {
                            functions_with_specific_kw[i] += 1;
                        }
                    }
                }
                functions += 1;
            }
        } else {
            // Non-function node: descend. Children are pushed in reverse so the
            // leftmost child is processed first. Note that children of function
            // nodes are never pushed, so nested functions are not extracted.
            for c in node
                .children(&mut cursor)
                .collect::<Vec<_>>()
                .into_iter()
                .rev()
            {
                call_stack.push(c);
            }
        }
    }
    Ok((
        builder,
        functions,
        functions_with_kw,
        functions_with_specific_kw,
    ))
}
/// Return the slice of `source` covered by the node `n`.
fn node_source_code<'a>(n: &Node, source: &'a [u8]) -> &'a [u8] {
    let span = n.start_byte()..n.end_byte();
    &source[span]
}
/// Language-specific description of the tree-sitter node kinds and field
/// names used by the statistics extraction.
struct Grammar {
    /// The tree-sitter language to load into the parser.
    lang: Language,
    /// Node kinds stripped from the source before keyword matching (comments).
    comment_nodes: HashSet<&'static str>,
    /// Node kinds stripped from the source before keyword matching (strings).
    string_literal_nodes: HashSet<&'static str>,
    /// Node kinds counted as loop statements.
    loop_nodes: HashSet<&'static str>,
    /// Node kinds counted as conditional statements/expressions.
    cond_nodes: HashSet<&'static str>,
    /// Node kinds treated as function definitions to extract.
    function_nodes: HashSet<&'static str>,
    /// Node kinds counted as function calls.
    function_call_nodes: HashSet<&'static str>,
    /// Node kinds holding a function's parameter list.
    param_seq_nodes: HashSet<&'static str>,
    /// Node kinds representing a single parameter.
    param_nodes: HashSet<&'static str>,
    /// Field name of a parameter's type, when the grammar exposes one.
    param_type_field: Option<&'static str>,
    /// Field name of a function's name node.
    name_field: &'static str,
}
fn c_grammar() -> Grammar {
Grammar {
lang: tree_sitter_c::LANGUAGE.into(),
comment_nodes: vec!["comment"].into_iter().collect(),
string_literal_nodes: vec!["string_literal"].into_iter().collect(),
loop_nodes: vec!["for_statement", "while_statement", "do_statement"]
.into_iter()
.collect(),
cond_nodes: vec!["if_statement", "switch_statement", "conditional_expression"]
.into_iter()
.collect(),
function_nodes: vec!["function_definition"].into_iter().collect(),
function_call_nodes: vec!["call_expression"].into_iter().collect(),
param_seq_nodes: vec!["parameter_list"].into_iter().collect(),
param_nodes: vec!["parameter_declaration"].into_iter().collect(),
param_type_field: Some("type"),
name_field: "declarator",
}
}
fn cpp_grammar() -> Grammar {
Grammar {
lang: tree_sitter_cpp::LANGUAGE.into(),
comment_nodes: vec!["comment"].into_iter().collect(),
string_literal_nodes: vec!["string_literal"].into_iter().collect(),
loop_nodes: vec!["for_range_loop", "for_statement", "while_statement"]
.into_iter()
.collect(),
cond_nodes: vec!["if_statement", "switch_statement", "conditional_expression"]
.into_iter()
.collect(),
function_nodes: vec!["function_definition", "template_declaration"]
.into_iter()
.collect(),
function_call_nodes: vec!["call_expression"].into_iter().collect(),
param_seq_nodes: vec!["parameter_list"].into_iter().collect(),
param_nodes: vec!["parameter_declaration", "variadic_parameter_declaration"]
.into_iter()
.collect(),
param_type_field: Some("type"),
name_field: "declarator",
}
}
fn cs_grammar() -> Grammar {
Grammar {
lang: tree_sitter_c_sharp::LANGUAGE.into(),
comment_nodes: vec!["comment"].into_iter().collect(),
string_literal_nodes: vec![
"string_literal",
"verbatim_string_literal",
"raw_string_literal",
]
.into_iter()
.collect(),
loop_nodes: vec!["for_statement", "while_statement", "do_statement"]
.into_iter()
.collect(),
cond_nodes: vec!["if_statement", "switch_statement", "conditional_expression"]
.into_iter()
.collect(),
function_nodes: vec![
"method_declaration",
"constructor_declaration",
"operator_declaration",
]
.into_iter()
.collect(),
function_call_nodes: vec!["invocation_expression"].into_iter().collect(),
param_seq_nodes: vec!["parameter_list"].into_iter().collect(),
param_nodes: vec!["parameter"].into_iter().collect(),
param_type_field: Some("type"),
name_field: "name",
}
}
fn ts_grammar() -> Grammar {
Grammar {
lang: tree_sitter_typescript::LANGUAGE_TYPESCRIPT.into(),
comment_nodes: vec!["comment"].into_iter().collect(),
string_literal_nodes: vec!["string_fragment"].into_iter().collect(),
loop_nodes: vec!["for_statement", "for_in_statement", "while_statement"]
.into_iter()
.collect(),
cond_nodes: vec!["if_statement", "switch_statement", "ternary_expression"]
.into_iter()
.collect(),
function_nodes: vec!["function_declaration", "method_definition"]
.into_iter()
.collect(),
function_call_nodes: vec![
"new_expression",
"call_expression",
"decorator_call_expression",
]
.into_iter()
.collect(),
param_seq_nodes: vec!["formal_parameters"].into_iter().collect(),
param_nodes: vec!["required_parameter", "optional_parameter"]
.into_iter()
.collect(),
param_type_field: Some("type"),
name_field: "name",
}
}
fn go_grammar() -> Grammar {
Grammar {
lang: tree_sitter_go::LANGUAGE.into(),
comment_nodes: vec!["comment"].into_iter().collect(),
string_literal_nodes: vec!["raw_string_literal", "interpreted_string_literal"]
.into_iter()
.collect(),
loop_nodes: vec!["for_statement"].into_iter().collect(),
cond_nodes: vec![
"if_statement",
"type_switch_statement",
"expression_switch_statement",
]
.into_iter()
.collect(),
function_nodes: vec!["function_declaration", "method_declaration"]
.into_iter()
.collect(),
function_call_nodes: vec!["call_expression"].into_iter().collect(),
param_seq_nodes: vec!["parameter_list"].into_iter().collect(),
param_nodes: vec!["parameter_declaration", "variadic_parameter_declaration"]
.into_iter()
.collect(),
param_type_field: Some("type"),
name_field: "name",
}
}
fn java_grammar() -> Grammar {
Grammar {
lang: tree_sitter_java::LANGUAGE.into(),
comment_nodes: vec!["line_comment", "block_comment"].into_iter().collect(),
string_literal_nodes: vec!["string_literal"].into_iter().collect(),
loop_nodes: vec![
"for_statement",
"enhanced_for_statement",
"while_statement",
"do_statement",
]
.into_iter()
.collect(),
cond_nodes: vec!["if_statement", "ternary_expression", "switch_expression"]
.into_iter()
.collect(),
function_nodes: vec!["method_declaration", "compact_constructor_declaration"]
.into_iter()
.collect(),
function_call_nodes: vec!["method_invocation", "explicit_constructor_invocation"]
.into_iter()
.collect(),
param_seq_nodes: vec!["formal_parameters"].into_iter().collect(),
param_nodes: vec!["formal_parameter"].into_iter().collect(),
param_type_field: Some("type"),
name_field: "name",
}
}
fn scala_grammar() -> Grammar {
Grammar {
lang: tree_sitter_scala::LANGUAGE.into(),
comment_nodes: vec!["comment", "block_comment"].into_iter().collect(),
string_literal_nodes: vec!["string"].into_iter().collect(),
loop_nodes: vec!["for_expression", "while_expression", "do_while_expression"]
.into_iter()
.collect(),
cond_nodes: vec!["if_expression", "match_expression"]
.into_iter()
.collect(),
function_nodes: vec!["function_definition"].into_iter().collect(),
function_call_nodes: vec!["call_expression"].into_iter().collect(),
param_seq_nodes: vec!["parameters"].into_iter().collect(),
param_nodes: vec!["parameter"].into_iter().collect(),
param_type_field: Some("type"),
name_field: "name",
}
}
fn fortran_grammar() -> Grammar {
Grammar {
lang: tree_sitter_fortran::LANGUAGE.into(),
comment_nodes: vec!["preproc_comment", "comment"].into_iter().collect(),
string_literal_nodes: vec!["string_literal"].into_iter().collect(),
loop_nodes: vec![
"loop_control_expression",
"where_statement",
"forall_statement",
"concurrent_statement",
"while_statement",
]
.into_iter()
.collect(),
cond_nodes: vec![
"if_statement",
"arithmetic_if_statement",
"select_case_statement",
"select_rank_statement",
"select_type_statement",
]
.into_iter()
.collect(),
function_nodes: vec!["function", "subroutine"].into_iter().collect(),
function_call_nodes: vec!["call_expression", "subroutine_call"]
.into_iter()
.collect(),
param_seq_nodes: vec!["parameters"].into_iter().collect(),
param_nodes: vec!["identifier"].into_iter().collect(),
param_type_field: None,
name_field: "name",
}
}
fn python_grammar() -> Grammar {
Grammar {
lang: tree_sitter_python::LANGUAGE.into(),
comment_nodes: vec!["comment"].into_iter().collect(),
string_literal_nodes: vec!["string"].into_iter().collect(),
loop_nodes: vec!["for_statement", "while_statement"]
.into_iter()
.collect(),
cond_nodes: vec!["if_statement", "conditional_expression", "match_statement"]
.into_iter()
.collect(),
function_nodes: vec!["function_definition", "lambda"].into_iter().collect(),
function_call_nodes: vec!["call"].into_iter().collect(),
param_seq_nodes: vec!["parameters"].into_iter().collect(),
param_nodes: vec!["parameter"].into_iter().collect(),
param_type_field: None,
name_field: "name",
}
}
/// Map a language identifier (as stored in the input CSV) to its grammar
/// description. Returns `None` for unsupported languages.
fn language_to_grammar(lang: &str) -> Option<Grammar> {
    let grammar = match lang {
        "c" => c_grammar(),
        "c++" => cpp_grammar(),
        "c#" => cs_grammar(),
        "java" => java_grammar(),
        "fortran" => fortran_grammar(),
        "python" => python_grammar(),
        "typescript" => ts_grammar(),
        "go" => go_grammar(),
        "scala" => scala_grammar(),
        _ => return None,
    };
    Some(grammar)
}
/// Count the nodes in the subtree of `root` whose kind is in `kinds`, and
/// report the deepest nesting level among them (1 = outermost match).
fn count_nodes_of_kind(root: &Node, kinds: &HashSet<&str>) -> (usize, usize) {
    let mut total = 0usize;
    let mut deepest = 0usize;
    let mut walker = root.walk();
    let mut pending: Vec<(Node, usize)> = vec![(*root, 1)];
    while let Some((current, level)) = pending.pop() {
        let matched = kinds.contains(current.kind());
        if matched {
            total += 1;
            deepest = deepest.max(level);
        }
        // Children of a matching node sit one nesting level deeper.
        let child_level = if matched { level + 1 } else { level };
        pending.extend(current.children(&mut walker).map(|c| (c, child_level)));
    }
    (total, deepest)
}
/// Search the subtree of `node` for nodes satisfying `pred`.
///
/// * `breadth == false`: depth-first search, returns at most one node (the
///   first match encountered).
/// * `breadth == true`: level-order search that collects every match at the
///   shallowest depth containing one, then stops.
///
/// The subtrees of matching nodes are never searched.
fn find_first_node<'a>(
    node: &Node<'a>,
    pred: &dyn Fn(&Node) -> bool,
    breadth: bool,
) -> Vec<Node<'a>> {
    let mut cursor = node.walk();
    let mut call_stack: Vec<(Node, usize)> = Vec::new();
    call_stack.push((*node, 0));
    let mut res: Vec<Node<'a>> = Vec::new();
    // Depth of the first match (breadth mode only): once set, visiting any
    // deeper node terminates the search.
    let mut max_depth: Option<usize> = None;
    while let Some((node, depth)) = call_stack.pop() {
        if max_depth.filter(|&d| depth > d).is_some() {
            // We moved past the level of the first match: done.
            return res;
        } else if pred(&node) {
            if breadth {
                res.push(node);
                if max_depth.is_none() {
                    max_depth = Some(depth);
                }
            } else {
                // Depth-first mode: the first match wins.
                return vec![node];
            }
        } else if breadth {
            // Put the children at the FRONT of the vector; since we pop from
            // the back, all currently queued nodes are visited before them
            // (FIFO / level order).
            let mut end_queue: Vec<(Node, usize)> =
                node.children(&mut cursor).map(|c| (c, depth + 1)).collect();
            end_queue.extend(call_stack);
            call_stack = end_queue;
        } else {
            // Depth-first: push children in reverse so the leftmost child is
            // popped first. Depth is unused in this mode, hence the 0.
            for c in node
                .children(&mut cursor)
                .collect::<Vec<_>>()
                .into_iter()
                .rev()
            {
                call_stack.push((c, 0));
            }
        }
    }
    vec![]
}
/// Kind-based wrapper over [`find_first_node`]: matches nodes whose kind is
/// in `kind`, with the same depth-first / level-order semantics.
fn find_first_node_of_kind<'a>(
    root: &Node<'a>,
    kind: &HashSet<&str>,
    breadth: bool,
) -> Vec<Node<'a>> {
    let matches_kind = |n: &Node| kind.contains(n.kind());
    find_first_node(root, &matches_kind, breadth)
}
/// Locate the first error or missing node in the subtree, depth-first.
fn find_first_error_node<'a>(root: &Node<'a>) -> Option<Node<'a>> {
    let is_broken = |n: &Node| n.is_error() || n.is_missing();
    find_first_node(root, &is_broken, false).first().copied()
}
/// 1-based `(row, col)` of the first parse error under `root`, if any.
fn find_first_error_position(root: &Node) -> Option<(usize, usize)> {
    let node = find_first_error_node(root)?;
    let start = node.start_position();
    Some((start.row + 1, start.column + 1))
}
/// Format an optional 1-based position as `row:col`, or `not-found` when the
/// position could not be determined.
fn position_to_string(position: Option<(usize, usize)>) -> String {
    position.map_or_else(
        || "not-found".to_string(),
        |(row, col)| format!("{row}:{col}"),
    )
}
/// Collect every node in the subtree of `root` that is attached to its parent
/// through the field name `field`.
///
/// The subtrees of collected field nodes are not searched further.
fn find_fields<'a>(root: &Node<'a>, field: &str) -> Vec<Node<'a>> {
    let mut res: Vec<Node<'a>> = Vec::new();
    // Ids of nodes already returned as field children, so the traversal does
    // not descend into them again below.
    let mut ids: HashSet<usize> = HashSet::new();
    let mut cursor = root.walk();
    let mut call_stack: Vec<Node> = Vec::new();
    call_stack.push(*root);
    while let Some(node) = call_stack.pop() {
        // A fresh cursor is required here because `cursor` is reused for the
        // generic child iteration below.
        for c in node.children_by_field_name(field, &mut node.walk()) {
            res.push(c);
            ids.insert(c.id());
        }
        // Depth-first: pushing in reverse keeps the leftmost child on top.
        for c in node
            .children(&mut cursor)
            .collect::<Vec<_>>()
            .into_iter()
            .rev()
        {
            if !ids.contains(&c.id()) {
                call_stack.push(c);
            }
        }
    }
    res
}
/// Depth-first search for the first node reachable through the field name
/// `field` (each node's direct field child is checked before descending).
fn find_first_field<'a>(root: &Node<'a>, field: &str) -> Option<Node<'a>> {
    let mut walker = root.walk();
    let mut pending: Vec<Node> = vec![*root];
    while let Some(current) = pending.pop() {
        if let Some(found) = current.child_by_field_name(field) {
            return Some(found);
        }
        // Push children in reverse so the leftmost child is visited first.
        let children: Vec<Node> = current.children(&mut walker).collect();
        pending.extend(children.into_iter().rev());
    }
    None
}
/// Collect every outermost node whose kind is in `kinds`; matched nodes are
/// not searched for nested matches.
fn find_kind<'a>(root: &Node<'a>, kinds: &HashSet<&str>) -> Vec<Node<'a>> {
    let mut found: Vec<Node<'a>> = Vec::new();
    let mut walker = root.walk();
    let mut pending: Vec<Node> = vec![*root];
    while let Some(current) = pending.pop() {
        if kinds.contains(current.kind()) {
            found.push(current);
        } else {
            pending.extend(current.children(&mut walker));
        }
    }
    found
}
/// Return a copy of `source` with the byte ranges of every (outermost) node
/// of the given kinds removed. `source` must be the text of `root`'s span.
fn remove_kind_from_source(source: &[u8], root: &Node, kinds: &HashSet<&str>) -> Vec<u8> {
    let mut to_remove = find_kind(root, kinds);
    // Delete from the back of the buffer so the offsets of earlier ranges
    // stay valid while draining.
    to_remove.sort_by_key(|n| std::cmp::Reverse(n.start_byte()));
    let base = root.start_byte();
    let mut stripped = source.to_vec();
    for node in to_remove {
        stripped.drain(node.start_byte() - base..node.end_byte() - base);
    }
    stripped
}
#[cfg(test)]
mod tests {
    use std::path::Path;
    use polars::prelude::SortMultipleOptions;
    use crate::utils::dataframes;
    use crate::utils::dataframes::*;
    use crate::utils::fs::*;
    use crate::utils::logger::test_logger;
    use super::*;
    /// Root folder of the fixtures for the parse phase tests.
    const TEST_DATA: &str = "tests/data/phases/parse";
    /// Run the parse phase on `input_file_path` and compare the produced
    /// output/logs CSVs (and every extracted function file) against their
    /// `.expected` fixtures. When `should_pass` is false, assert that `run`
    /// fails instead of producing output.
    fn test_parse(
        input_file_path: &str,
        keywords: &[&str],
        languages: Option<Vec<&str>>,
        should_pass: bool,
    ) -> Result<()> {
        let input_df = open_csv(input_file_path, None, None)?;
        ensure!(
            has_column(&input_df, "name"),
            "Input dataframe must have a 'name' column"
        );
        let input_df: Vec<&str> = dataframes::str(&input_df, "name")?;
        // Clean any leftovers from previous runs before starting.
        let output_file_path = format!("{input_file_path}.functions.csv");
        delete_file(&output_file_path, true)?;
        let logs_file_path = format!("{input_file_path}.function_logs.csv");
        delete_file(&logs_file_path, true)?;
        for path in input_df.iter() {
            delete_dir(format!("{path}.functions"), true)?;
        }
        if should_pass {
            run(
                input_file_path,
                None,
                None,
                keywords,
                languages,
                "ignore",
                8,
                0,
                false,
                test_logger(),
            )?;
            // Compare the logs CSV with its expected counterpart; both are
            // sorted by name because the parallel run makes row order
            // nondeterministic.
            let logs_df = open_csv(&logs_file_path, None, None)?;
            ensure!(
                has_column(&logs_df, "name"),
                "Logs dataframe must have a 'name' column"
            );
            let sorted_logs_df = logs_df
                .sort(vec!["name"], SortMultipleOptions::new())
                .unwrap();
            let expected_logs_df = open_csv(
                &format!("{input_file_path}.function_logs.csv.expected"),
                None,
                None,
            )?;
            ensure!(
                has_column(&expected_logs_df, "name"),
                "Expected logs dataframe must have a 'name' column"
            );
            let sorted_expected_logs_df = expected_logs_df
                .sort(vec!["name"], SortMultipleOptions::new())
                .unwrap();
            assert_eq!(sorted_expected_logs_df, sorted_logs_df);
            // Same comparison for the per-function output CSV, sorted by path.
            let output_df = open_csv(&output_file_path, None, None)?;
            ensure!(
                has_column(&output_df, "path"),
                "Output dataframe must have a 'path' column"
            );
            let sorted_output_df = output_df.sort(vec!["path"], SortMultipleOptions::new())?;
            let expected_df = open_csv(&format!("{output_file_path}.expected"), None, None)?;
            ensure!(
                has_column(&expected_df, "path"),
                "Expected dataframe must have a 'path' column"
            );
            let sorted_expected_df = expected_df.sort(vec!["path"], SortMultipleOptions::new())?;
            assert_eq!(sorted_expected_df, sorted_output_df);
            // Every extracted function file must byte-match its expected copy
            // under the sibling `.expected` folder.
            for path in dataframes::str(&sorted_output_df, "path")? {
                let path = Path::new(path);
                ensure!(path.exists(), "Parsed file not found: {}", path.display());
                let expected_path_name = format!(
                    "{}.expected/{}",
                    path.parent()
                        .with_context(|| "Failed to get parent directory")?
                        .to_str()
                        .with_context(|| "Failed to convert parent directory to string")?,
                    path.file_name()
                        .with_context(|| "Failed to get file name")?
                        .to_str()
                        .with_context(|| "Failed to convert file name to string")?
                );
                let expected_path = Path::new(&expected_path_name);
                assert_eq!(
                    std::fs::read_to_string(path)?,
                    std::fs::read_to_string(expected_path)?
                );
            }
        } else {
            // The run itself is expected to fail (e.g. unsupported language).
            ensure!(run(
                input_file_path,
                None,
                None,
                keywords,
                languages,
                "ignore",
                8,
                0,
                false,
                test_logger()
            )
            .is_err());
        }
        // Clean up the generated artifacts after the comparison.
        delete_file(&output_file_path, true)?;
        delete_file(&logs_file_path, true)?;
        for path in input_df {
            delete_dir(format!("{path}.functions"), true)?;
        }
        Ok(())
    }
    /// Full run over the mixed-language fixture with all keyword files.
    #[test]
    fn parse_fp() -> Result<()> {
        let keywords = vec![
            "tests/data/keywords/fp_types.json",
            "tests/data/keywords/fp_transcendental.json",
            "tests/data/keywords/fp_others.json",
            "tests/data/keywords/long_double.json",
        ];
        let input_file_path = format!("{TEST_DATA}/to_parse.csv");
        test_parse(&input_file_path, &keywords, None, true)
    }
    /// Go-specific fixture.
    #[test]
    fn parse_go() -> Result<()> {
        let keywords = vec![
            "tests/data/keywords/fp_types.json",
            "tests/data/keywords/fp_transcendental.json",
            "tests/data/keywords/fp_others.json",
        ];
        let input_file_path = format!("{TEST_DATA}/parse_go.csv");
        test_parse(&input_file_path, &keywords, None, true)
    }
    /// An unreadable/invalid file must still produce a (sentinel) log row.
    #[test]
    fn invalid_file() -> Result<()> {
        let keywords = vec!["tests/data/keywords/c_float.json"];
        let input_file_path = format!("{TEST_DATA}/invalid.csv");
        test_parse(&input_file_path, &keywords, None, true)
    }
    /// Requesting an unsupported language must make `run` fail.
    #[test]
    fn invalid_lang() -> Result<()> {
        let keywords = vec!["tests/data/keywords/scala_float.json"];
        let input_file_path = format!("{TEST_DATA}/empty.csv");
        test_parse(&input_file_path, &keywords, Some(["rust"].to_vec()), false)
    }
    /// An empty input file must succeed and produce empty outputs.
    #[test]
    fn empty() -> Result<()> {
        let keywords = vec!["tests/data/keywords/scala_float.json"];
        let input_file_path = format!("{TEST_DATA}/empty.csv");
        test_parse(&input_file_path, &keywords, Some(["c"].to_vec()), true)
    }
}