use super::types::{ChunkType, RawChunk};
/// Line-count cutoff for JSON files: `chunk_json` produces no chunks for a file
/// with this many lines or more (the comparison is `>=`, so the bound is exclusive).
const JSON_MAX_LINES: usize = 500;
/// Largest number of lines placed in a single chunk by `chunk_plaintext`;
/// longer paragraphs are split into consecutive windows of this size.
const PLAINTEXT_MAX_LINES: usize = 50;
/// Chunks a document according to the (case-insensitive) extension of `file`.
///
/// Supported extensions are `md`/`mdx`, `yaml`/`yml`, `toml`, `json`,
/// `txt`/`log` and `xml`.
///
/// Returns `None` when the path has no extension or the extension is not one
/// of the supported ones; otherwise returns whatever the format-specific
/// chunker produced (which may be an empty vector).
pub fn chunk_document(file: &str, content: &str) -> Option<Vec<RawChunk>> {
    // A missing extension can never match a supported format, so bail out early.
    let extension = std::path::Path::new(file)
        .extension()?
        .to_str()?
        .to_ascii_lowercase();

    match extension.as_str() {
        "md" | "mdx" => Some(chunk_markdown(file, content)),
        "yaml" | "yml" => Some(chunk_yaml(file, content)),
        "toml" => Some(chunk_toml(file, content)),
        // The JSON chunker already returns an `Option`, so forward it untouched.
        "json" => chunk_json(file, content),
        "txt" | "log" => Some(chunk_plaintext(file, content)),
        "xml" => Some(chunk_xml(file, content)),
        _ => None,
    }
}
/// Builds a [`RawChunk`] for a span of a non-code document.
///
/// `start_line` and `end_line` are stored as given. The chunk id is
/// `{file}::{chunk_type}::{name}::{start_line}` when `function_name` holds a
/// non-empty name, and `{file}:{start_line}:{end_line}` otherwise. All
/// relationship and NLP fields are initialised empty and the depth is `0`.
pub(super) fn document_chunk(
    file: &str,
    start_line: usize,
    end_line: usize,
    content: String,
    function_name: Option<String>,
    language: &str,
    chunk_type: ChunkType,
) -> RawChunk {
    // An empty name is treated the same as no name at all for id purposes.
    let id = function_name
        .as_deref()
        .filter(|name| !name.is_empty())
        .map_or_else(
            || format!("{file}:{start_line}:{end_line}"),
            |name| format!("{file}::{}::{name}::{start_line}", chunk_type.as_str()),
        );

    RawChunk {
        id,
        file: file.to_owned(),
        start_line,
        end_line,
        content,
        function_name,
        language: Some(language.to_owned()),
        chunk_type,
        calls: Vec::new(),
        inherits_from: Vec::new(),
        chunk_depth: 0,
        parent_chunk_id: None,
        child_chunk_ids: Vec::new(),
        nlp_keywords: Vec::new(),
        nlp_code_refs: Vec::new(),
        virtual_terms: Vec::new(),
    }
}
/// Splits Markdown into one chunk per heading-delimited section.
///
/// A new section starts at every ATX heading (one to six `#` characters
/// followed by whitespace or the end of the line) that lies outside a fenced
/// code block. The heading text becomes the chunk's `function_name`; text
/// before the first heading is emitted without a name. Sections consisting
/// only of whitespace are dropped.
///
/// Fenced code blocks opened with backticks or tildes are skipped when looking
/// for headings. A fence is only closed by a run of the same character that is
/// at least as long as the opening run, so `~~~` inside a backtick fence (or a
/// shorter inner fence) does not end the block.
///
/// Returns an empty vector when `content` has no lines. When `content` has
/// lines but none of them produced a chunk (all blank), a single chunk
/// covering the whole input is returned.
pub(super) fn chunk_markdown(file: &str, content: &str) -> Vec<RawChunk> {
    // Returns the fence character and the length of its leading run when
    // `trimmed` starts with at least three backticks or three tildes.
    fn fence_marker(trimmed: &str) -> Option<(char, usize)> {
        let marker = trimmed.chars().next()?;
        if marker != '`' && marker != '~' {
            return None;
        }
        let run = trimmed.chars().take_while(|&c| c == marker).count();
        if run >= 3 {
            Some((marker, run))
        } else {
            None
        }
    }

    // Returns the trimmed title of an ATX heading line, or `None` when the
    // line is not a heading (e.g. `#tag`, `#!/bin/sh`, or seven or more `#`).
    fn atx_heading(trimmed: &str) -> Option<&str> {
        let level = trimmed.chars().take_while(|&c| c == '#').count();
        if !(1..=6).contains(&level) {
            return None;
        }
        // '#' is a single byte, so `level` is also a valid byte offset.
        let rest = &trimmed[level..];
        if rest.is_empty() || rest.starts_with(char::is_whitespace) {
            Some(rest.trim())
        } else {
            None
        }
    }

    let lines: Vec<&str> = content.lines().collect();
    if lines.is_empty() {
        return Vec::new();
    }

    let mut out: Vec<RawChunk> = Vec::new();

    // Emits lines[start..end] as one chunk; `end` is exclusive, which makes it
    // equal to the 1-based number of the last included line.
    let emit = |out: &mut Vec<RawChunk>, start: usize, end: usize, heading: Option<&str>| {
        if start >= end {
            return;
        }
        let text = lines[start..end].join("\n");
        if text.trim().is_empty() {
            return;
        }
        out.push(document_chunk(
            file,
            start + 1,
            end,
            text,
            heading.map(str::to_owned),
            "markdown",
            ChunkType::Docstring,
        ));
    };

    let mut section_start = 0usize;
    let mut section_heading: Option<String> = None;
    // Character and run length of the fence that opened the current code block.
    let mut open_fence: Option<(char, usize)> = None;

    for (i, line) in lines.iter().enumerate() {
        let trimmed = line.trim_start();

        if let Some((marker, run)) = fence_marker(trimmed) {
            match open_fence {
                None => open_fence = Some((marker, run)),
                Some((open_marker, open_run)) if marker == open_marker && run >= open_run => {
                    open_fence = None;
                }
                // A different or shorter fence inside a code block is just content.
                Some(_) => {}
            }
            continue;
        }
        if open_fence.is_some() {
            continue;
        }

        if let Some(title) = atx_heading(trimmed) {
            emit(&mut out, section_start, i, section_heading.as_deref());
            section_heading = if title.is_empty() {
                None
            } else {
                Some(title.to_string())
            };
            section_start = i;
        }
    }
    emit(
        &mut out,
        section_start,
        lines.len(),
        section_heading.as_deref(),
    );

    if out.is_empty() {
        out.push(document_chunk(
            file,
            1,
            lines.len(),
            content.to_string(),
            None,
            "markdown",
            ChunkType::Docstring,
        ));
    }
    out
}
/// Chunks YAML by top-level mapping key.
///
/// A line starts a new section when it is not blank, does not begin with
/// whitespace, `-` or `#`, and contains a `:` whose left-hand side (trimmed)
/// is a non-empty key without spaces. That key becomes the section name.
pub(super) fn chunk_yaml(file: &str, content: &str) -> Vec<RawChunk> {
    // Extracts the key of an unindented `key: ...` line, if the line is one.
    fn top_level_key(line: &str) -> Option<String> {
        let body = line.trim_end();
        // Only trailing whitespace was removed, so the first character of
        // `body` is also the first character of `line`; blank lines stop here.
        let first = body.chars().next()?;
        if first == '#' || first == '-' || first.is_whitespace() {
            return None;
        }

        let (raw_key, _) = body.split_once(':')?;
        let key = raw_key.trim();
        if key.is_empty() || key.contains(' ') {
            None
        } else {
            Some(key.to_owned())
        }
    }

    chunk_by_top_level_key(file, content, "yaml", top_level_key)
}
/// Chunks TOML by table header.
///
/// A line starts a new section when, ignoring trailing whitespace, it begins
/// with `[` and ends with `]` (this covers both `[table]` and
/// `[[array-of-tables]]`). The section name is the text between the brackets,
/// trimmed; headers with an empty name are ignored.
pub(super) fn chunk_toml(file: &str, content: &str) -> Vec<RawChunk> {
    // Extracts the table name from a `[name]` / `[[name]]` header line.
    fn table_name(line: &str) -> Option<String> {
        let header = line.trim_end();
        if !(header.starts_with('[') && header.ends_with(']')) {
            return None;
        }

        // `trim_*_matches` strips every leading `[` and every trailing `]`,
        // so a single pass handles double-bracket headers as well.
        let name = header
            .trim_start_matches('[')
            .trim_end_matches(']')
            .trim();
        if name.is_empty() {
            None
        } else {
            Some(name.to_owned())
        }
    }

    chunk_by_top_level_key(file, content, "toml", table_name)
}
/// Splits `content` into sections, starting a new one at every line for which
/// `header_of` returns a name.
///
/// Each section runs from its header line up to (but not including) the next
/// header and is emitted as a `ChunkType::Constant` chunk tagged with
/// `language` and named after the header. Lines before the first header form
/// an unnamed section. Sections containing only whitespace are dropped.
///
/// Returns an empty vector when `content` has no lines. When `content` has
/// lines but none of them produced a chunk (all blank), a single unnamed chunk
/// covering the whole input is returned.
fn chunk_by_top_level_key(
    file: &str,
    content: &str,
    language: &str,
    header_of: impl Fn(&str) -> Option<String>,
) -> Vec<RawChunk> {
    let lines: Vec<&str> = content.lines().collect();
    if lines.is_empty() {
        return Vec::new();
    }

    let mut out: Vec<RawChunk> = Vec::new();

    // Emits lines[start..end] as one chunk; `end` is exclusive, which makes it
    // equal to the 1-based number of the last included line.
    let emit = |out: &mut Vec<RawChunk>, start: usize, end: usize, name: Option<&str>| {
        if start >= end {
            return;
        }
        let text = lines[start..end].join("\n");
        if text.trim().is_empty() {
            return;
        }
        out.push(document_chunk(
            file,
            start + 1,
            end,
            text,
            name.map(str::to_owned),
            language,
            ChunkType::Constant,
        ));
    };

    let mut section_start = 0usize;
    let mut section_name: Option<String> = None;

    for (i, &line) in lines.iter().enumerate() {
        if let Some(name) = header_of(line) {
            // Close the section that was open before this header.
            emit(&mut out, section_start, i, section_name.as_deref());
            section_name = Some(name);
            section_start = i;
        }
    }
    emit(&mut out, section_start, lines.len(), section_name.as_deref());

    if out.is_empty() {
        out.push(document_chunk(
            file,
            1,
            lines.len(),
            content.to_string(),
            None,
            language,
            ChunkType::Constant,
        ));
    }
    out
}
/// Emits a JSON file as a single `ChunkType::Constant` chunk.
///
/// Files with no lines, or with `JSON_MAX_LINES` lines or more, produce an
/// empty vector. The result is always wrapped in `Some`.
pub(super) fn chunk_json(file: &str, content: &str) -> Option<Vec<RawChunk>> {
    let total_lines = content.lines().count();

    // Only files with 1..JSON_MAX_LINES lines (upper bound exclusive) are indexed.
    let chunks = if (1..JSON_MAX_LINES).contains(&total_lines) {
        vec![document_chunk(
            file,
            1,
            total_lines,
            content.to_string(),
            None,
            "json",
            ChunkType::Constant,
        )]
    } else {
        Vec::new()
    };
    Some(chunks)
}
/// Chunks plain text by paragraph.
///
/// Paragraphs are maximal runs of non-blank lines. Each paragraph is emitted
/// as one or more `ChunkType::Code` chunks of at most `PLAINTEXT_MAX_LINES`
/// lines. The chunk language is `"log"` for a `.log` extension
/// (case-insensitive) and `"text"` otherwise.
///
/// Returns an empty vector when `content` has no lines. When `content` has
/// lines but no paragraph was found (all blank), a single chunk covering the
/// whole input is returned.
pub(super) fn chunk_plaintext(file: &str, content: &str) -> Vec<RawChunk> {
    let lines: Vec<&str> = content.lines().collect();
    if lines.is_empty() {
        return Vec::new();
    }

    let is_log = std::path::Path::new(file)
        .extension()
        .and_then(|ext| ext.to_str())
        .map_or(false, |ext| ext.eq_ignore_ascii_case("log"));
    let lang = if is_log { "log" } else { "text" };

    let total = lines.len();
    let mut chunks: Vec<RawChunk> = Vec::new();
    let mut cursor = 0usize;

    while cursor < total {
        // Blank lines only separate paragraphs; they never belong to a chunk.
        if lines[cursor].trim().is_empty() {
            cursor += 1;
            continue;
        }

        // Advance to the end of the current run of non-blank lines.
        let para_start = cursor;
        while cursor < total && !lines[cursor].trim().is_empty() {
            cursor += 1;
        }

        // Split long paragraphs into consecutive fixed-size windows.
        for (window_idx, window) in lines[para_start..cursor]
            .chunks(PLAINTEXT_MAX_LINES)
            .enumerate()
        {
            let first = para_start + window_idx * PLAINTEXT_MAX_LINES;
            let text = window.join("\n");
            if text.trim().is_empty() {
                continue;
            }
            chunks.push(document_chunk(
                file,
                first + 1,
                first + window.len(),
                text,
                None,
                lang,
                ChunkType::Code,
            ));
        }
    }

    if chunks.is_empty() {
        chunks.push(document_chunk(
            file,
            1,
            total,
            content.to_string(),
            None,
            lang,
            ChunkType::Code,
        ));
    }
    chunks
}
/// Chunks XML by the direct children of the root element, using a line-based
/// nesting heuristic.
///
/// Nesting depth is tracked per line from `count_xml_opens` and
/// `count_xml_closes` (clamped at zero). A child starts on a line that is
/// reached at depth 1 and opens more tags than it closes; it ends on the first
/// later line after which the depth is back to 1 or lower. Each such span
/// becomes a `ChunkType::Class` chunk named after the first tag on its opening
/// line. Elements that open and close on the same line do not start a child.
///
/// Returns an empty vector when `content` has no lines. When no child span was
/// found, a single unnamed chunk covering the whole input is returned.
pub(super) fn chunk_xml(file: &str, content: &str) -> Vec<RawChunk> {
    let lines: Vec<&str> = content.lines().collect();
    if lines.is_empty() {
        return Vec::new();
    }

    let mut chunks: Vec<RawChunk> = Vec::new();
    let mut nesting: i32 = 0;
    // First line index and tag name of the child element currently being collected.
    let mut open_child: Option<(usize, Option<String>)> = None;

    for (idx, line) in lines.iter().enumerate() {
        let opened = count_xml_opens(line);
        let closed = count_xml_closes(line);

        if nesting == 1 && open_child.is_none() && opened > closed {
            open_child = Some((idx, first_xml_tag_name(line)));
        }

        let before = nesting;
        nesting = (nesting + opened as i32 - closed as i32).max(0);

        // The child is complete once this line brings us back to root level.
        let back_at_root_level = before >= 1 && nesting <= 1;
        if !back_at_root_level {
            continue;
        }
        if let Some((first, name)) = open_child.take() {
            let text = lines[first..=idx].join("\n");
            if !text.trim().is_empty() {
                chunks.push(document_chunk(
                    file,
                    first + 1,
                    idx + 1,
                    text,
                    name,
                    "xml",
                    ChunkType::Class,
                ));
            }
        }
    }

    if chunks.is_empty() {
        chunks.push(document_chunk(
            file,
            1,
            lines.len(),
            content.to_string(),
            None,
            "xml",
            ChunkType::Class,
        ));
    }
    chunks
}
/// Counts the opening tags on a single line.
///
/// Every `<` that does not begin `<?`, `<!` or `</` and has a `>` somewhere
/// after it on the same line counts as one opening tag, unless the text up to
/// and including that `>` ends with `/>` (self-closing). Scanning resumes
/// right after the matched `>`. This is a textual heuristic: the contents of
/// comments and attribute values are not treated specially.
fn count_xml_opens(line: &str) -> usize {
    // Prefixes of constructs that never open an element; `<!` also covers `<!--`.
    const NON_OPENING: [&str; 3] = ["<?", "<!", "</"];

    let mut total = 0usize;
    let mut remaining = line;

    while let Some(pos) = remaining.find('<') {
        let candidate = &remaining[pos..];

        if NON_OPENING.iter().any(|prefix| candidate.starts_with(prefix)) {
            // Step past this `<` only, so anything following it is still scanned.
            remaining = &candidate[1..];
            continue;
        }

        match candidate.find('>') {
            Some(gt) => {
                if !candidate[..=gt].ends_with("/>") {
                    total += 1;
                }
                remaining = &candidate[gt + 1..];
            }
            // No `>` remains on the line, so no later `<` can complete a tag.
            None => break,
        }
    }
    total
}
/// Counts the occurrences of `</` on a single line.
///
/// This is a purely textual count; `</` inside comments or text content is
/// counted like any other occurrence.
fn count_xml_closes(line: &str) -> usize {
    // `<` and `/` are ASCII, so scanning adjacent byte pairs finds exactly the
    // same (necessarily non-overlapping) occurrences as a substring search.
    line.as_bytes()
        .windows(2)
        .filter(|pair| pair[0] == b'<' && pair[1] == b'/')
        .count()
}
/// Returns the element name of the first tag on the line.
///
/// Only the first `<` is examined. The result is `None` when the line has no
/// `<`, when that `<` begins a declaration, comment/doctype or closing tag
/// (`<?`, `<!`, `</`), or when no name follows it. Otherwise the name is the
/// text after `<` up to the first whitespace, `>` or `/` (or the end of the
/// line).
fn first_xml_tag_name(line: &str) -> Option<String> {
    let (_, after_bracket) = line.split_once('<')?;
    if matches!(after_bracket.chars().next(), Some('?' | '!' | '/')) {
        return None;
    }

    // The first piece of the split is everything before the first delimiter.
    let name = after_bracket
        .split(|c: char| c.is_whitespace() || c == '>' || c == '/')
        .next()
        .unwrap_or("")
        .trim();

    if name.is_empty() {
        None
    } else {
        Some(name.to_owned())
    }
}