use std::{
borrow::Cow,
fmt,
fs::File,
io::{self, BufRead, BufReader, Read},
path::{Path, PathBuf},
str::FromStr,
};
use crate::{
config::Config,
language::syntax::{FileContext, LanguageContext, SyntaxCounter},
stats::{CodeStats, Report},
utils::{ext::SliceExt, fs as fsutils},
};
use encoding_rs_io::DecodeReaderBytesBuilder;
use grep_searcher::{LineIter, LineStep};
use rayon::prelude::*;
use self::LanguageType::*;
include!(concat!(env!("OUT_DIR"), "/language_type.rs"));
impl LanguageType {
/// Parses the file at `path` and returns its per-file statistics.
///
/// The file is read fully into memory and transparently transcoded to
/// UTF-8 (via `encoding_rs_io`) before being counted. On failure the
/// `io::Error` is returned together with the offending path so the
/// caller can report which file could not be read.
pub fn parse(self, path: PathBuf, config: &Config) -> Result<Report, (io::Error, PathBuf)> {
    let mut bytes = Vec::new();
    {
        // Open first; hand the path back to the caller on any I/O error.
        let file = match File::open(&path) {
            Ok(file) => file,
            Err(err) => return Err((err, path)),
        };
        // Decode BOM-marked / non-UTF-8 encodings into UTF-8 bytes.
        let mut decoder = DecodeReaderBytesBuilder::new().build(file);
        if let Err(err) = decoder.read_to_end(&mut bytes) {
            return Err((err, path));
        }
    }
    let mut report = Report::new(path);
    report += self.parse_from_slice(&bytes, config);
    Ok(report)
}
/// Counts statistics for an in-memory string as this language.
///
/// Thin convenience wrapper that hands the string's UTF-8 bytes to
/// [`parse_from_slice`].
pub fn parse_from_str<A: AsRef<str>>(self, text: A, config: &Config) -> CodeStats {
    let bytes = text.as_ref().as_bytes();
    self.parse_from_slice(bytes, config)
}
/// Parses the bytes provided as the given [`LanguageType`].
///
/// Jupyter notebooks are JSON and are delegated to a dedicated parser.
/// For everything else, any leading run of whole lines that contains
/// none of the language's "important" syntax (comment or string
/// delimiters, embedded-language markers, ...) is counted with a cheap
/// per-line classifier, in parallel with the full stateful parse of the
/// remainder.
pub fn parse_from_slice<A: AsRef<[u8]>>(self, text: A, config: &Config) -> CodeStats {
    let text = text.as_ref();

    if self == LanguageType::Jupyter {
        // Notebooks are structured JSON, not line-oriented source.
        // (`text` is already `&[u8]`; the former `text.as_ref()` here
        // was a redundant no-op conversion.)
        return self
            .parse_jupyter(text, config)
            .unwrap_or_else(CodeStats::new);
    }

    let syntax = SyntaxCounter::new(self);

    // Find the first occurrence of any syntax that requires stateful
    // parsing, then back up to the newline preceding it. `position`
    // yields the distance from the match back to the nearest preceding
    // b'\n'; `p == 0` would mean the match sits directly on a newline,
    // leaving no simple prefix to skip. No match / no preceding newline
    // means the whole input goes through the full parser.
    if let Some(end) = syntax
        .shared
        .important_syntax
        .earliest_find(text)
        .and_then(|m| {
            text[..=m.start()]
                .iter()
                .rev()
                .position(|&c| c == b'\n')
                .filter(|&p| p != 0)
                .map(|p| m.start() - p)
        })
    {
        // `end` indexes the newline terminating the skippable prefix, so
        // split just past it: `skippable_text` holds whole lines free of
        // important syntax, `rest` needs the precise parser.
        let (skippable_text, rest) = text.split_at(end + 1);
        let is_fortran = syntax.shared.is_fortran;
        let is_literate = syntax.shared.is_literate;
        let comments = syntax.shared.line_comments;
        trace!(
            "Using Simple Parse on {:?}",
            String::from_utf8_lossy(skippable_text)
        );
        let parse_lines = move || self.parse_lines(config, rest, CodeStats::new(), syntax);
        let simple_parse = move || {
            LineIter::new(b'\n', skippable_text)
                .par_bridge()
                .map(|line| {
                    // Fortran's fixed-form layout is column-sensitive,
                    // so its leading whitespace must be preserved.
                    let line = if is_fortran { line } else { line.trim() };
                    if line.trim().is_empty() {
                        (1, 0, 0)
                    } else if is_literate
                        || comments.iter().any(|c| line.starts_with(c.as_bytes()))
                    {
                        (0, 0, 1)
                    } else {
                        (0, 1, 0)
                    }
                })
                // Sum the (blanks, code, comments) tallies per line.
                .reduce(|| (0, 0, 0), |a, b| (a.0 + b.0, a.1 + b.1, a.2 + b.2))
        };

        // Run the precise and the simple passes concurrently.
        let (mut stats, (blanks, code, comments)) = rayon::join(parse_lines, simple_parse);
        stats.blanks += blanks;
        stats.code += code;
        stats.comments += comments;
        stats
    } else {
        self.parse_lines(config, text, CodeStats::new(), syntax)
    }
}
/// Precise, stateful line-by-line counter: classifies every line of
/// `lines` as code, comment, or blank, accumulating into `stats`.
///
/// `syntax` carries parser state (open block-comment stack, quote
/// state, per-language syntax tables) across lines.
#[inline]
fn parse_lines(
    self,
    config: &Config,
    lines: &[u8],
    mut stats: CodeStats,
    mut syntax: SyntaxCounter,
) -> CodeStats {
    let mut stepper = LineStep::new(b'\n', 0, lines.len());

    while let Some((start, end)) = stepper.next(lines) {
        let line = &lines[start..end];

        // Fortran fixed-form is column-sensitive: leading whitespace is
        // significant, so only non-Fortran lines are trimmed.
        let line = if syntax.shared.is_fortran {
            line
        } else {
            line.trim()
        };
        trace!("{}", String::from_utf8_lossy(line));

        // Fast path: a line that cannot change multi-line state is
        // classified (and counted) immediately.
        if syntax.can_perform_single_line_analysis(line, &mut stats) {
            continue;
        }

        // Snapshot before analysis: did this line begin inside a
        // comment?  Either a block comment is open, or we are inside a
        // doc quote and the config counts doc strings as comments.
        let started_in_comments = !syntax.stack.is_empty()
            || (config.treat_doc_strings_as_comments == Some(true)
                && syntax.quote.is_some()
                && syntax.quote_is_doc_quote);

        let ended_with_comments =
            match syntax.perform_multi_line_analysis(lines, start, end, config) {
                crate::language::syntax::AnalysisReport::Normal(end) => end,
                // An embedded-language region (e.g. a fenced block in
                // Markdown, or <script>/<style> content in HTML) was
                // parsed as a whole: record its stats as a child blob,
                // then resume the stepper past the region.
                crate::language::syntax::AnalysisReport::ChildLanguage(FileContext {
                    language,
                    end,
                    stats: blob,
                }) => {
                    match language {
                        LanguageContext::Markdown { balanced, language } => {
                            // The delimiter lines count as comments in
                            // the host document: both when the fence is
                            // balanced, otherwise just the opening one.
                            stats.comments += if balanced { 2 } else { 1 };
                            *stats.blobs.entry(language).or_default() += blob;
                        }
                        LanguageContext::Rust => {
                            // Rust's embedded content is recorded as
                            // Markdown (presumably doc-comment prose —
                            // see the syntax module for the producer).
                            *stats.blobs.entry(LanguageType::Markdown).or_default() += blob;
                        }
                        LanguageContext::Html { language } => {
                            // The enclosing tag line is code in the host.
                            stats.code += 1;
                            *stats.blobs.entry(language).or_default() += blob;
                        }
                    }
                    // Skip the whole embedded region.
                    stepper = LineStep::new(b'\n', end, lines.len());
                    continue;
                }
            };

        trace!("{}", String::from_utf8_lossy(line));

        // Final classification of the current line.
        if syntax.shared.is_literate
            || syntax.line_is_comment(line, config, ended_with_comments, started_in_comments)
        {
            stats.comments += 1;
            trace!("Comment No.{}", stats.comments);
            trace!("Was the Comment stack empty?: {}", !started_in_comments);
        } else {
            stats.code += 1;
            trace!("Code No.{}", stats.code);
        }
    }

    stats
}
/// Parses a Jupyter notebook (JSON) into per-language statistics.
///
/// Markdown cells are counted as Markdown; code cells are counted as
/// the notebook's kernel language, falling back to the language implied
/// by `metadata.language_info.file_extension`, then to Python. Returns
/// `None` when the input is not valid notebook JSON.
fn parse_jupyter(&self, json: &[u8], config: &Config) -> Option<CodeStats> {
    #[derive(Deserialize)]
    struct Jupyter {
        cells: Vec<JupyterCell>,
        metadata: JupyterMetadata,
    }

    #[derive(Clone, Copy, Deserialize, PartialEq, Eq)]
    #[serde(rename_all = "lowercase")]
    enum CellType {
        Markdown,
        Code,
    }

    #[derive(Deserialize)]
    struct JupyterCell {
        cell_type: CellType,
        source: Vec<String>,
    }

    #[derive(Deserialize)]
    struct JupyterMetadata {
        kernelspec: serde_json::Value,
        language_info: serde_json::Value,
    }

    let jupyter: Jupyter = serde_json::from_slice(json).ok()?;
    let mut jupyter_stats = CodeStats::new();

    // Prefer the kernel's declared language name; otherwise derive the
    // language from the declared file extension. Per the nbformat spec,
    // `language_info.file_extension` includes a leading dot (".py"),
    // while `from_file_extension` is fed bare extensions elsewhere (from
    // `Path::extension`), so strip the dot before matching. Bare
    // extensions without a dot continue to work unchanged.
    let language = jupyter
        .metadata
        .kernelspec
        .get("language")
        .and_then(serde_json::Value::as_str)
        .and_then(|v| LanguageType::from_str(v).ok())
        .or_else(|| {
            jupyter
                .metadata
                .language_info
                .get("file_extension")
                .and_then(serde_json::Value::as_str)
                .map(|ext| ext.trim_start_matches('.'))
                .and_then(LanguageType::from_file_extension)
        })
        .unwrap_or(LanguageType::Python);

    // Count each cell in parallel; each yields (language, stats).
    let iter = jupyter
        .cells
        .par_iter()
        .map(|cell| match cell.cell_type {
            CellType::Markdown => (
                LanguageType::Markdown,
                LanguageType::Markdown.parse_from_str(cell.source.join(""), config),
            ),
            CellType::Code => (
                language,
                language.parse_from_str(cell.source.join(""), config),
            ),
        })
        .collect::<Vec<_>>();

    // Merge the per-cell results into per-language blobs.
    for (language, stats) in iter {
        *jupyter_stats.blobs.entry(language).or_default() += stats;
    }

    Some(jupyter_stats)
}
}
#[cfg(test)]
mod tests {
    use super::*;

    // `allows_nested` is generated into `language_type.rs` at build time
    // (see the `include!` at the top of this file); Rust block comments
    // nest (`/* /* */ */`), so this pins the generated table entry.
    #[test]
    fn rust_allows_nested() {
        assert!(LanguageType::Rust.allows_nested());
    }
}