#[cfg(feature = "office")]
use crate::Result;
#[cfg(feature = "office")]
use crate::core::config::ExtractionConfig;
#[cfg(feature = "office")]
use crate::plugins::{DocumentExtractor, Plugin};
#[cfg(feature = "office")]
use crate::types::{ExtractionResult, Metadata, Table};
#[cfg(feature = "office")]
use ahash::AHashMap;
#[cfg(feature = "office")]
use async_trait::async_trait;
#[cfg(feature = "office")]
use std::borrow::Cow;
#[cfg(feature = "office")]
pub struct RstExtractor;
#[cfg(feature = "office")]
impl RstExtractor {
pub fn new() -> Self {
Self
}
fn extract_text_and_metadata(content: &str) -> (String, Metadata) {
let mut metadata = Metadata::default();
let mut additional: AHashMap<Cow<'static, str>, serde_json::Value> = AHashMap::new();
let text = Self::extract_text_from_rst(content, &mut additional);
metadata.additional = additional;
(text, metadata)
}
fn extract_text_from_rst(content: &str, metadata: &mut AHashMap<Cow<'static, str>, serde_json::Value>) -> String {
let mut output = String::new();
let lines: Vec<&str> = content.lines().collect();
let mut i = 0;
while i < lines.len() {
let line = lines[i];
if line.trim().starts_with(':')
&& line.contains(':')
&& let Some((key, value)) = Self::parse_field_list_line(line)
{
let mut full_value = value.clone();
while i + 1 < lines.len() {
let next = lines[i + 1];
if !next.is_empty() && (next.starts_with(" ") || next.starts_with("\t")) {
full_value.push('\n');
full_value.push_str(next);
i += 1;
} else {
break;
}
}
Self::add_metadata_field(&key, &full_value, metadata);
output.push_str(&format!(":{}: {}\n", key, full_value));
i += 1;
continue;
}
if i + 1 < lines.len() {
let next_line = lines[i + 1];
if Self::is_section_underline(next_line) && !line.trim().is_empty() {
output.push_str(line.trim());
output.push('\n');
i += 2;
continue;
}
}
if line.trim().starts_with(".. code-block::") || line.trim().starts_with(".. code::") {
output.push_str(line.trim());
output.push('\n');
i += 1;
while i < lines.len() && lines[i].trim().is_empty() {
output.push('\n');
i += 1;
}
while i < lines.len() && (lines[i].starts_with(" ") || lines[i].is_empty()) {
output.push_str(lines[i]);
output.push('\n');
i += 1;
}
continue;
}
if line.trim().starts_with(".. highlight::") {
let lang = line.trim_start_matches(".. highlight::").trim().to_string();
if !lang.is_empty() {
output.push_str("highlight: ");
output.push_str(&lang);
output.push('\n');
}
i += 1;
continue;
}
if line.trim().ends_with("::") && !line.trim().starts_with(".. ") {
if let Some(display_text) = line.strip_suffix("::")
&& !display_text.trim().is_empty()
{
output.push_str(display_text.trim());
output.push('\n');
}
i += 1;
while i < lines.len() && (lines[i].starts_with(" ") || lines[i].is_empty()) {
if !lines[i].is_empty() {
output.push_str(lines[i].trim_start());
output.push('\n');
}
i += 1;
}
continue;
}
if Self::is_list_item(line) {
output.push_str(line.trim());
output.push('\n');
i += 1;
continue;
}
if line.trim().starts_with(".. ") || line.trim() == ".." {
let trimmed = line.trim();
let directive = if trimmed == ".." { "" } else { &trimmed[3..] };
if directive.starts_with("image::") {
let uri = directive.strip_prefix("image::").unwrap_or("").trim();
output.push_str("image: ");
output.push_str(uri);
output.push('\n');
i += 1;
continue;
}
if directive.starts_with("note::")
|| directive.starts_with("warning::")
|| directive.starts_with("important::")
|| directive.starts_with("caution::")
|| directive.starts_with("hint::")
|| directive.starts_with("tip::")
{
output.push_str(trimmed);
output.push('\n');
i += 1;
while i < lines.len() && (lines[i].starts_with(" ") || lines[i].is_empty()) {
if !lines[i].is_empty() {
output.push_str(lines[i]);
output.push('\n');
}
i += 1;
}
continue;
}
if directive.starts_with("math::") {
let math = directive.strip_prefix("math::").unwrap_or("").trim();
if !math.is_empty() {
output.push_str("math: ");
output.push_str(math);
output.push('\n');
}
i += 1;
while i < lines.len() && (lines[i].starts_with(" ") || lines[i].is_empty()) {
if !lines[i].is_empty() {
output.push_str(lines[i].trim());
output.push('\n');
}
i += 1;
}
continue;
}
i += 1;
while i < lines.len() && (lines[i].starts_with(" ") || lines[i].is_empty()) {
i += 1;
}
continue;
}
if !line.trim().is_empty() && !Self::is_markup_line(line) {
output.push_str(line);
output.push('\n');
}
i += 1;
}
output
}
fn parse_field_list_line(line: &str) -> Option<(String, String)> {
let trimmed = line.trim();
if !trimmed.starts_with(':') {
return None;
}
let rest = &trimmed[1..];
if let Some(end_pos) = rest.find(':') {
let key = rest[..end_pos].to_string();
let value = rest[end_pos + 1..].trim().to_string();
return Some((key, value));
}
None
}
fn add_metadata_field(key: &str, value: &str, metadata: &mut AHashMap<Cow<'static, str>, serde_json::Value>) {
let key_lower = key.to_lowercase();
match key_lower.as_str() {
"author" | "authors" => {
metadata.insert(Cow::Borrowed("author"), serde_json::Value::String(value.to_string()));
}
"date" => {
metadata.insert(Cow::Borrowed("date"), serde_json::Value::String(value.to_string()));
}
"version" | "revision" => {
metadata.insert(Cow::Borrowed("version"), serde_json::Value::String(value.to_string()));
}
"title" => {
metadata.insert(Cow::Borrowed("title"), serde_json::Value::String(value.to_string()));
}
_ => {
metadata.insert(
Cow::Owned(format!("field_{}", key_lower)),
serde_json::Value::String(value.to_string()),
);
}
}
}
fn is_section_underline(line: &str) -> bool {
let trimmed = line.trim();
if trimmed.len() < 3 {
return false;
}
let chars: Vec<char> = trimmed.chars().collect();
let first = chars[0];
matches!(first, '=' | '-' | '~' | '+' | '^' | '"' | '`' | '#' | '*') && chars.iter().all(|c| *c == first)
}
fn is_list_item(line: &str) -> bool {
let trimmed = line.trim_start();
if trimmed.starts_with("* ") || trimmed.starts_with("+ ") || trimmed.starts_with("- ") {
return true;
}
if let Some(space_pos) = trimmed.find(' ')
&& space_pos > 0
&& space_pos < 4
{
let prefix = &trimmed[..space_pos];
if prefix.ends_with('.') || prefix.ends_with(')') {
return prefix[..prefix.len() - 1].chars().all(|c| c.is_numeric());
}
}
false
}
fn is_markup_line(line: &str) -> bool {
let trimmed = line.trim();
if trimmed.len() < 3 {
return false;
}
let first = trimmed.chars().next().unwrap();
trimmed.chars().all(|c| c == first)
&& matches!(first, '=' | '-' | '~' | '+' | '^' | '"' | '`' | '#' | '*' | '/')
}
fn extract_tables(content: &str) -> Vec<Table> {
let mut tables = Vec::new();
let lines: Vec<&str> = content.lines().collect();
let mut i = 0;
while i < lines.len() {
let line = lines[i];
if line.contains("|")
&& (line.contains("=") || line.contains("-"))
&& let Some(table) = Self::parse_grid_table(&lines, &mut i)
{
tables.push(table);
continue;
}
i += 1;
}
tables
}
fn parse_grid_table(lines: &[&str], i: &mut usize) -> Option<Table> {
let mut cells = Vec::new();
let mut row = Vec::new();
while *i < lines.len() && lines[*i].contains("|") {
let line = lines[*i].trim_matches(|c| c == '|');
if !line.is_empty() {
let cell_content = line.split('|').map(|s| s.trim().to_string()).collect::<Vec<_>>();
row.extend(cell_content);
if !row.is_empty() {
cells.push(row.clone());
row.clear();
}
}
*i += 1;
}
if cells.is_empty() {
return None;
}
let markdown = Self::cells_to_markdown(&cells);
Some(Table {
cells,
markdown,
page_number: 1,
bounding_box: None,
})
}
fn cells_to_markdown(cells: &[Vec<String>]) -> String {
if cells.is_empty() {
return String::new();
}
let mut md = String::new();
if !cells.is_empty() {
md.push('|');
for cell in &cells[0] {
md.push(' ');
md.push_str(cell);
md.push_str(" |");
}
md.push('\n');
md.push('|');
for _ in &cells[0] {
md.push_str(" --- |");
}
md.push('\n');
for row in &cells[1..] {
md.push('|');
for cell in row {
md.push(' ');
md.push_str(cell);
md.push_str(" |");
}
md.push('\n');
}
}
md
}
}
#[cfg(feature = "office")]
impl Default for RstExtractor {
fn default() -> Self {
Self::new()
}
}
#[cfg(feature = "office")]
impl Plugin for RstExtractor {
fn name(&self) -> &str {
"rst-extractor"
}
fn version(&self) -> String {
env!("CARGO_PKG_VERSION").to_string()
}
fn initialize(&self) -> Result<()> {
Ok(())
}
fn shutdown(&self) -> Result<()> {
Ok(())
}
fn description(&self) -> &str {
"Native Rust extractor for reStructuredText (RST) documents"
}
fn author(&self) -> &str {
"Kreuzberg Team"
}
}
#[cfg(feature = "office")]
#[cfg_attr(not(target_arch = "wasm32"), async_trait)]
#[cfg_attr(target_arch = "wasm32", async_trait(?Send))]
impl DocumentExtractor for RstExtractor {
#[cfg_attr(
feature = "otel",
tracing::instrument(
skip(self, content, _config),
fields(
extractor.name = self.name(),
content.size_bytes = content.len(),
)
)
)]
async fn extract_bytes(
&self,
content: &[u8],
mime_type: &str,
_config: &ExtractionConfig,
) -> Result<ExtractionResult> {
let text = String::from_utf8_lossy(content).into_owned();
let (extracted_text, metadata) = Self::extract_text_and_metadata(&text);
let tables = Self::extract_tables(&text);
Ok(ExtractionResult {
content: extracted_text,
mime_type: mime_type.to_string().into(),
metadata,
tables,
detected_languages: None,
chunks: None,
images: None,
djot_content: None,
pages: None,
elements: None,
ocr_elements: None,
document: None,
#[cfg(any(feature = "keywords-yake", feature = "keywords-rake"))]
extracted_keywords: None,
quality_score: None,
processing_warnings: Vec::new(),
annotations: None,
})
}
fn supported_mime_types(&self) -> &[&str] {
&["text/x-rst", "text/prs.fallenstein.rst"]
}
fn priority(&self) -> i32 {
50
}
}
#[cfg(all(test, feature = "office"))]
mod tests {
use super::*;
#[test]
fn test_rst_extractor_plugin_interface() {
let extractor = RstExtractor::new();
assert_eq!(extractor.name(), "rst-extractor");
assert_eq!(extractor.version(), env!("CARGO_PKG_VERSION"));
assert_eq!(extractor.priority(), 50);
assert!(!extractor.supported_mime_types().is_empty());
}
#[test]
fn test_rst_extractor_supports_text_x_rst() {
let extractor = RstExtractor::new();
assert!(extractor.supported_mime_types().contains(&"text/x-rst"));
}
#[test]
fn test_rst_extractor_supports_fallenstein_rst() {
let extractor = RstExtractor::new();
assert!(extractor.supported_mime_types().contains(&"text/prs.fallenstein.rst"));
}
#[test]
fn test_extract_text_from_rst_simple_document() {
let content = r#"
Title
=====
This is a paragraph.
Another paragraph.
"#;
let mut metadata = AHashMap::new();
let output = RstExtractor::extract_text_from_rst(content, &mut metadata);
assert!(output.contains("Title"));
assert!(output.contains("This is a paragraph"));
assert!(output.contains("Another paragraph"));
}
#[test]
fn test_extract_text_from_rst_with_code_block() {
let content = r#"
.. code-block:: python
def hello():
print("world")
Some text after.
"#;
let mut metadata = AHashMap::new();
let output = RstExtractor::extract_text_from_rst(content, &mut metadata);
assert!(output.contains("code-block"));
assert!(output.contains("def hello"));
assert!(output.contains("Some text after"));
}
#[test]
fn test_extract_text_from_rst_with_metadata() {
let content = r#"
:Author: John Doe
:Date: 2024-01-15
First paragraph.
Second paragraph.
"#;
let mut metadata = AHashMap::new();
let output = RstExtractor::extract_text_from_rst(content, &mut metadata);
assert!(output.contains("First paragraph"));
assert!(output.contains("Second paragraph"));
assert!(metadata.contains_key("author"));
assert_eq!(metadata.get("author").and_then(|v| v.as_str()), Some("John Doe"));
}
#[test]
fn test_cells_to_markdown_format() {
let cells = vec![
vec!["Name".to_string(), "Age".to_string()],
vec!["Alice".to_string(), "30".to_string()],
vec!["Bob".to_string(), "25".to_string()],
];
let markdown = RstExtractor::cells_to_markdown(&cells);
assert!(markdown.contains("Name"));
assert!(markdown.contains("Age"));
assert!(markdown.contains("Alice"));
assert!(markdown.contains("Bob"));
assert!(markdown.contains("---"));
}
#[test]
fn test_rst_extractor_default() {
let extractor = RstExtractor;
assert_eq!(extractor.name(), "rst-extractor");
}
#[test]
fn test_rst_extractor_initialize_shutdown() {
let extractor = RstExtractor::new();
assert!(extractor.initialize().is_ok());
assert!(extractor.shutdown().is_ok());
}
}