use crate::types::{Element, ElementId, ElementMetadata, ElementType};
use std::collections::HashMap;
use super::types::{ListItemMetadata, ListType};
pub fn detect_list_items(text: &str) -> Vec<ListItemMetadata> {
let mut items = Vec::new();
let lines: Vec<&str> = text.lines().collect();
let mut current_byte_offset = 0;
for line in lines {
let line_start_offset = current_byte_offset;
let trimmed = line.trim_start();
let indent_level = (line.len() - trimmed.len()) / 2;
let byte_end = line_start_offset + line.len();
let next_offset = if byte_end < text.len() {
let rest = &text.as_bytes()[byte_end..];
if rest.starts_with(b"\r\n") {
byte_end + 2
} else if rest.starts_with(b"\n") || rest.starts_with(b"\r") {
byte_end + 1
} else {
byte_end
}
} else {
byte_end
};
if let Some(stripped) = trimmed.strip_prefix('-')
&& (stripped.starts_with(' ') || stripped.is_empty())
{
items.push(ListItemMetadata {
list_type: ListType::Bullet,
byte_start: line_start_offset,
byte_end,
indent_level: indent_level as u32,
});
current_byte_offset = next_offset;
continue;
}
if let Some(stripped) = trimmed.strip_prefix('*')
&& (stripped.starts_with(' ') || stripped.is_empty())
{
items.push(ListItemMetadata {
list_type: ListType::Bullet,
byte_start: line_start_offset,
byte_end,
indent_level: indent_level as u32,
});
current_byte_offset = next_offset;
continue;
}
if let Some(stripped) = trimmed.strip_prefix('•')
&& (stripped.starts_with(' ') || stripped.is_empty())
{
items.push(ListItemMetadata {
list_type: ListType::Bullet,
byte_start: line_start_offset,
byte_end,
indent_level: indent_level as u32,
});
current_byte_offset = next_offset;
continue;
}
if let Some(pos) = trimmed.find('.') {
let prefix = &trimmed[..pos];
if prefix.chars().all(|c| c.is_ascii_digit())
&& pos > 0
&& pos < 3
&& trimmed.len() > pos + 1
&& trimmed[pos + 1..].starts_with(' ')
{
items.push(ListItemMetadata {
list_type: ListType::Numbered,
byte_start: line_start_offset,
byte_end,
indent_level: indent_level as u32,
});
current_byte_offset = next_offset;
continue;
}
}
if let Some(pos) = trimmed.find('.') {
let prefix = &trimmed[..pos];
if prefix.len() == 1
&& prefix.chars().all(|c| c.is_alphabetic())
&& pos > 0
&& trimmed.len() > pos + 1
&& trimmed[pos + 1..].starts_with(' ')
{
items.push(ListItemMetadata {
list_type: ListType::Lettered,
byte_start: line_start_offset,
byte_end,
indent_level: indent_level as u32,
});
current_byte_offset = next_offset;
continue;
}
}
if indent_level >= 2 && !trimmed.is_empty() {
items.push(ListItemMetadata {
list_type: ListType::Indented,
byte_start: line_start_offset,
byte_end,
indent_level: indent_level as u32,
});
current_byte_offset = next_offset;
continue;
}
current_byte_offset = next_offset;
}
items
}
pub fn generate_element_id(text: &str, element_type: ElementType, page_number: Option<usize>) -> ElementId {
let type_hash = format!("{:?}", element_type)
.bytes()
.fold(0u64, |acc, b| acc.wrapping_mul(31).wrapping_add(b as u64));
let text_hash = text
.bytes()
.fold(0u64, |acc, b| acc.wrapping_mul(31).wrapping_add(b as u64));
let page_hash = page_number
.unwrap_or(1)
.to_string()
.bytes()
.fold(0u64, |acc, b| acc.wrapping_mul(31).wrapping_add(b as u64));
let combined = type_hash
.wrapping_mul(65599)
.wrapping_add(text_hash)
.wrapping_mul(65599)
.wrapping_add(page_hash);
ElementId::new(format!("elem-{:x}", combined)).expect("ElementId creation failed")
}
pub(super) fn add_paragraphs(elements: &mut Vec<Element>, text: &str, page_number: usize, title: &Option<String>) {
if text.is_empty() {
return;
}
for paragraph in text.split("\n\n").filter(|p| !p.trim().is_empty()) {
let para_text = paragraph.trim();
if para_text.is_empty() {
continue;
}
let element_id = generate_element_id(para_text, ElementType::NarrativeText, Some(page_number));
elements.push(Element {
element_id,
element_type: ElementType::NarrativeText,
text: para_text.to_string(),
metadata: ElementMetadata {
page_number: Some(page_number),
filename: title.clone(),
coordinates: None,
element_index: Some(elements.len()),
additional: HashMap::new(),
},
});
}
}