#[cfg(feature = "office")]
use crate::Result;
#[cfg(feature = "office")]
use crate::core::config::ExtractionConfig;
#[cfg(feature = "office")]
use crate::plugins::{DocumentExtractor, Plugin};
#[cfg(feature = "office")]
use crate::types::builder::DocumentStructureBuilder;
#[cfg(feature = "office")]
use crate::types::document_structure::{AnnotationKind, DocumentStructure, TextAnnotation};
#[cfg(feature = "office")]
use crate::types::{ExtractionResult, Metadata, Table};
#[cfg(feature = "office")]
use ahash::AHashMap;
#[cfg(feature = "office")]
use async_trait::async_trait;
#[cfg(feature = "office")]
use org::Org;
#[cfg(feature = "office")]
use std::borrow::Cow;
/// Extractor plugin for Org Mode documents.
///
/// This is a unit struct: it holds no state, and the parsing helpers in its
/// inherent `impl` are associated functions that do not take `self`.
#[cfg(feature = "office")]
pub struct OrgModeExtractor;
#[cfg(feature = "office")]
impl OrgModeExtractor {
/// Creates a new extractor.
///
/// The type is a unit struct, so construction does no work.
pub fn new() -> Self {
Self
}
/// Collects document-level metadata from `#+KEY: value` lines and renders the body text.
///
/// Only the first 100 lines of `org_text` are scanned. Keyword names are matched
/// case-insensitively, because Org treats `#+title:` and `#+TITLE:` alike.
///
/// Recognised keywords:
/// - `TITLE` is stored as `additional["title"]`.
/// - `AUTHOR` is stored as `additional["author"]` plus a one-element `additional["authors"]`.
/// - `DATE` is stored as `metadata.created_at` and `additional["date"]`.
/// - `KEYWORDS` is split on commas into `additional["keywords"]`; blank entries are dropped.
/// - Any other keyword is stored as `additional["directive_<lowercased key>"]`.
///
/// A line is ignored when its key is empty, when its key contains whitespace (this is
/// what a block opener such as `#+BEGIN_SRC python :results output` looks like), or
/// when its value is empty.
///
/// Returns the populated metadata together with the text from `Self::extract_content`.
fn extract_metadata_and_content(org_text: &str, org: &Org) -> (Metadata, String) {
    let mut metadata = Metadata::default();
    let mut additional: AHashMap<Cow<'static, str>, serde_json::Value> = Default::default();

    for line in org_text.lines().take(100) {
        let Some((key, value)) = line
            .trim()
            .strip_prefix("#+")
            .and_then(|rest| rest.split_once(':'))
        else {
            continue;
        };
        let key = key.trim();
        let value = value.trim();
        // A real keyword is a single whitespace-free token with a non-empty value.
        if key.is_empty() || value.is_empty() || key.contains(char::is_whitespace) {
            continue;
        }

        match key.to_ascii_uppercase().as_str() {
            "TITLE" => {
                additional.insert(Cow::Borrowed("title"), serde_json::json!(value));
            }
            "AUTHOR" => {
                additional.insert(Cow::Borrowed("author"), serde_json::json!(value));
                additional.insert(Cow::Borrowed("authors"), serde_json::json!(vec![value]));
            }
            "DATE" => {
                metadata.created_at = Some(value.to_string());
                additional.insert(Cow::Borrowed("date"), serde_json::json!(value));
            }
            "KEYWORDS" => {
                let keywords: Vec<&str> = value
                    .split(',')
                    .map(str::trim)
                    .filter(|keyword| !keyword.is_empty())
                    .collect();
                if !keywords.is_empty() {
                    additional.insert(Cow::Borrowed("keywords"), serde_json::json!(keywords));
                }
            }
            _ => {
                additional.insert(
                    Cow::Owned(format!("directive_{}", key.to_lowercase())),
                    serde_json::json!(value),
                );
            }
        }
    }

    metadata.additional = additional;
    let content = Self::extract_content(org);
    (metadata, content)
}
/// Renders the whole Org tree as plain text.
///
/// Delegates to `Self::extract_org_tree` and returns the result with leading and
/// trailing whitespace removed.
fn extract_content(org: &Org) -> String {
    let mut rendered = String::new();
    Self::extract_org_tree(org, &mut rendered);
    rendered.trim().to_owned()
}
/// Appends the text of `org` and all of its subtrees to `content`, depth first.
///
/// A non-empty heading is written as `# <heading>` on its own line; the same `# `
/// prefix is used for every node. Each non-blank content line is then written
/// trimmed, and a node that has any content lines is followed by one blank line.
fn extract_org_tree(org: &Org, content: &mut String) {
    let title = org.heading();
    if !title.is_empty() {
        content.push_str("# ");
        content.push_str(title);
        content.push('\n');
    }

    let body = org.content_as_ref();
    if !body.is_empty() {
        let kept = body
            .iter()
            .map(|raw| raw.trim())
            .filter(|text| !text.is_empty());
        for text in kept {
            content.push_str(text);
            content.push('\n');
        }
        // The separator is emitted even when every content line was blank.
        content.push('\n');
    }

    for child in org.subtrees_as_ref() {
        Self::extract_org_tree(child, content);
    }
}
/// Collects every table found in `org` and its subtrees.
///
/// The tree walk itself is done by `Self::extract_tables_from_tree`.
fn extract_tables(org: &Org) -> Vec<Table> {
    let mut found: Vec<Table> = Vec::new();
    Self::extract_tables_from_tree(org, &mut found);
    found
}
/// Appends every table in `org`'s content, and then in its subtrees, to `tables`.
///
/// A table is a run of consecutive lines that, once trimmed, both start and end with
/// `|`. Any other line ends the current table.
///
/// Within a table:
/// - Rows starting with `|-` are horizontal rules in Org and carry no data, so they
///   are skipped.
/// - Only the outer pipes are stripped before splitting on `|`. Empty cells are kept
///   so the remaining cells stay in their own columns.
/// - Rows whose cells are all empty are skipped.
///
/// Every table is reported with `page_number: 1` and no bounding box.
fn extract_tables_from_tree(org: &Org, tables: &mut Vec<Table>) {
    let mut rows: Vec<Vec<String>> = Vec::new();

    // Moves the accumulated rows into a finished `Table`, leaving `rows` empty.
    let mut flush = |rows: &mut Vec<Vec<String>>| {
        if rows.is_empty() {
            return;
        }
        let cells = std::mem::take(rows);
        let markdown = Self::cells_to_markdown(&cells);
        tables.push(Table {
            cells,
            markdown,
            page_number: 1,
            bounding_box: None,
        });
    };

    for line in org.content_as_ref() {
        let trimmed = line.trim();
        if !(trimmed.starts_with('|') && trimmed.ends_with('|')) {
            flush(&mut rows);
            continue;
        }
        if trimmed.starts_with("|-") {
            continue;
        }
        let inner = trimmed.strip_prefix('|').unwrap_or(trimmed);
        let inner = inner.strip_suffix('|').unwrap_or(inner);
        let cells: Vec<String> = inner.split('|').map(|cell| cell.trim().to_string()).collect();
        if cells.iter().any(|cell| !cell.is_empty()) {
            rows.push(cells);
        }
    }
    flush(&mut rows);

    for subtree in org.subtrees_as_ref() {
        Self::extract_tables_from_tree(subtree, tables);
    }
}
/// Strips Org inline markup from `raw` and reports where the markup was.
///
/// Returns the plain text and a list of annotations whose `start`/`end` are byte
/// offsets into that plain text.
///
/// Two constructs are recognised, in this order at each position:
/// 1. `[[url]]` and `[[url][label]]` links, parsed by `Self::parse_org_link`. The
///    display text replaces the link and receives a `Link` annotation.
/// 2. Emphasis delimited by `*`, `/`, `_`, `=`, `~` or `+`. The opening marker must be
///    at the start of the text or follow ASCII whitespace, `(` or `"`, and must be
///    followed by a non-whitespace byte. The closer is found by
///    `Self::find_org_markup_close`.
///
/// Anything else is copied through one character at a time. Annotations that would
/// cover an empty range are not emitted.
fn parse_inline_markup(raw: &str) -> (String, Vec<TextAnnotation>) {
    let bytes = raw.as_bytes();
    let total = bytes.len();
    let mut plain = String::with_capacity(total);
    let mut spans = Vec::new();
    let mut pos = 0;

    while pos < total {
        if bytes[pos..].starts_with(b"[[")
            && let Some((url, label, resume_at)) = Self::parse_org_link(raw, pos)
        {
            let start = plain.len() as u32;
            plain.push_str(&label);
            let end = plain.len() as u32;
            if end > start {
                spans.push(TextAnnotation {
                    start,
                    end,
                    kind: AnnotationKind::Link { url, title: None },
                });
            }
            pos = resume_at;
            continue;
        }

        let marker = bytes[pos];
        if marker.is_ascii() && Self::is_org_markup_char(marker) {
            let opens_at_boundary = pos == 0 || {
                let prev = bytes[pos - 1];
                prev.is_ascii_whitespace() || prev == b'(' || prev == b'"'
            };
            let body_follows = bytes
                .get(pos + 1)
                .is_some_and(|next| !next.is_ascii_whitespace());
            let closer = if opens_at_boundary && body_follows {
                Self::find_org_markup_close(bytes, pos + 1, marker)
            } else {
                None
            };

            if let Some(close) = closer {
                // Both markers are ASCII bytes, so these slice bounds are char boundaries.
                let start = plain.len() as u32;
                plain.push_str(&raw[pos + 1..close]);
                let end = plain.len() as u32;
                if end > start {
                    let kind = match marker {
                        b'*' => AnnotationKind::Bold,
                        b'/' => AnnotationKind::Italic,
                        b'_' => AnnotationKind::Underline,
                        b'=' | b'~' => AnnotationKind::Code,
                        b'+' => AnnotationKind::Strikethrough,
                        _ => unreachable!(),
                    };
                    spans.push(TextAnnotation { start, end, kind });
                }
                pos = close + 1;
                continue;
            }
        }

        // `pos` only ever advances by whole characters, so a character starts here.
        let ch = raw[pos..].chars().next().unwrap();
        plain.push(ch);
        pos += ch.len_utf8();
    }

    (plain, spans)
}
/// Returns `true` when `b` is one of the six inline markup delimiters
/// `*`, `/`, `_`, `=`, `~` or `+`.
fn is_org_markup_char(b: u8) -> bool {
    b"*/_=~+".contains(&b)
}
/// Finds the closing `marker` for an emphasis span whose content begins at `from`.
///
/// A byte at index `j` qualifies as the closer when all of these hold:
/// - `j > from`, so the span is never empty;
/// - the byte before it is not ASCII whitespace;
/// - it is the last byte, or the next byte is ASCII whitespace or one of `. , ; : ) ] "`.
///
/// Returns the index of the first qualifying byte, or `None` if there is none.
fn find_org_markup_close(bytes: &[u8], from: usize, marker: u8) -> Option<usize> {
    let first_candidate = from.checked_add(1)?;
    (first_candidate..bytes.len()).find(|&idx| {
        if bytes[idx] != marker || bytes[idx - 1].is_ascii_whitespace() {
            return false;
        }
        match bytes.get(idx + 1) {
            None => true,
            Some(&next) => {
                next.is_ascii_whitespace()
                    || matches!(next, b'.' | b',' | b';' | b':' | b')' | b']' | b'"')
            }
        }
    })
}
/// Parses an Org link that begins at byte offset `start` of `text`.
///
/// Accepts `[[url]]` and `[[url][description]]`. The link ends at the first `]]`
/// after the opening brackets, and a `][` separator is only honoured inside that
/// span. This keeps a plain link such as `[[a]]` from being merged with a described
/// link later on the same line.
///
/// Returns `(url, display, end)`:
/// - `display` is the description, or the URL when the description is absent or empty;
/// - `end` is the byte offset just past the closing `]]`.
///
/// Returns `None` when `start` is out of range or not on a character boundary, when
/// the text there does not start with `[[`, or when no closing `]]` follows.
fn parse_org_link(text: &str, start: usize) -> Option<(String, String, usize)> {
    let rest = text.get(start..)?.strip_prefix("[[")?;
    let close = rest.find("]]")?;
    let inside = &rest[..close];

    let (url, display) = match inside.split_once("][") {
        Some((url, description)) if !description.is_empty() => (url, description),
        Some((url, _)) => (url, url),
        None => (inside, inside),
    };

    // Opening `[[` (2 bytes) + link body + closing `]]` (2 bytes).
    Some((url.to_string(), display.to_string(), start + 2 + close + 2))
}
/// Returns the label of every `[fn:LABEL]` marker in `line`, in order of appearance.
///
/// The label is everything between `[fn:` and the next `]`. Markers with an empty
/// label are skipped. Scanning stops at the first `[fn:` that has no closing `]`.
fn find_footnote_references(line: &str) -> Vec<String> {
    let mut labels = Vec::new();
    let mut remaining = line;

    while let Some((_, after_open)) = remaining.split_once("[fn:") {
        match after_open.split_once(']') {
            Some((label, tail)) => {
                if !label.is_empty() {
                    labels.push(label.to_string());
                }
                remaining = tail;
            }
            None => break,
        }
    }

    labels
}
/// Builds a block-level [`DocumentStructure`] from raw Org text with a line-oriented scan.
///
/// The scan has two phases:
///
/// 1. **Leading keywords.** `#+KEY: value` lines at the top of the file (blank lines
///    allowed) are collected into one metadata block. The phase stops at the first
///    other non-blank line, or at a `#+BEGIN_…` block opener so that a block at the
///    top of the file is not mistaken for a keyword.
/// 2. **Body.** Each remaining line is classified, first match wins:
///    - other `#+…` keyword lines are skipped;
///    - a `:PROPERTIES:` … `:END:` drawer becomes a metadata block;
///    - `*`-prefixed headings, with an optional TODO keyword and trailing `:tags:`
///      stored as attributes;
///    - `#+BEGIN_SRC` becomes a code block, `#+BEGIN_QUOTE` a quote container, and any
///      other `#+BEGIN_X` a raw block;
///    - runs of `|…|` lines become a table;
///    - runs of list items become a list, with `[ ]`/`[x]` checkboxes stored as attributes;
///    - anything else non-blank becomes a paragraph with inline markup resolved,
///      followed by one footnote node per `[fn:…]` reference.
///
/// Block markers are matched ASCII case-insensitively. Heading depth saturates at
/// `u8::MAX` rather than overflowing. Table rows starting with `|-` are treated as
/// rules and skipped. Empty cells are kept so columns stay aligned, and rows whose
/// cells are all empty are skipped.
fn build_document_structure(org_text: &str) -> DocumentStructure {
    // ASCII case-insensitive `str::starts_with`.
    fn has_prefix_ignore_case(text: &str, prefix: &str) -> bool {
        text.as_bytes()
            .get(..prefix.len())
            .is_some_and(|head| head.eq_ignore_ascii_case(prefix.as_bytes()))
    }

    // Joins raw lines starting at `lines[*cursor]` until a line whose trimmed form starts
    // with `end_marker`. The end line is consumed but not included.
    fn take_block_body(lines: &[&str], cursor: &mut usize, end_marker: &str) -> String {
        let mut body = String::new();
        while *cursor < lines.len() {
            let line = lines[*cursor];
            *cursor += 1;
            if has_prefix_ignore_case(line.trim(), end_marker) {
                break;
            }
            if !body.is_empty() {
                body.push('\n');
            }
            body.push_str(line);
        }
        body
    }

    let mut builder = DocumentStructureBuilder::new().source_format("orgmode");
    let lines: Vec<&str> = org_text.lines().collect();
    let mut i = 0;

    // Phase 1: leading `#+KEY: value` lines.
    let mut metadata_entries: Vec<(String, String)> = Vec::new();
    while i < lines.len() {
        let trimmed = lines[i].trim();
        if let Some(rest) = trimmed.strip_prefix("#+") {
            if has_prefix_ignore_case(trimmed, "#+BEGIN_") {
                break;
            }
            if let Some((key, val)) = rest.split_once(':') {
                let key_upper = key.trim().to_uppercase();
                let value = val.trim().to_string();
                if !value.is_empty() {
                    metadata_entries.push((key_upper, value));
                }
            }
            i += 1;
            continue;
        }
        if !trimmed.is_empty() {
            break;
        }
        i += 1;
    }
    if !metadata_entries.is_empty() {
        builder.push_metadata_block(metadata_entries, None);
    }

    // Phase 2: document body.
    while i < lines.len() {
        let trimmed = lines[i].trim();

        // Skip stray keyword lines, but let block markers of any case through.
        let is_block_marker =
            has_prefix_ignore_case(trimmed, "#+BEGIN") || has_prefix_ignore_case(trimmed, "#+END");
        if trimmed.starts_with("#+") && !is_block_marker {
            i += 1;
            continue;
        }

        if trimmed == ":PROPERTIES:" {
            let mut props: Vec<(String, String)> = Vec::new();
            i += 1;
            while i < lines.len() {
                let pt = lines[i].trim();
                i += 1;
                if pt == ":END:" {
                    break;
                }
                // Drawer entries look like `:KEY: value`.
                if let Some(entry) = pt.strip_prefix(':')
                    && let Some((key, value)) = entry.split_once(':')
                    && !key.is_empty()
                {
                    props.push((key.to_string(), value.trim().to_string()));
                }
            }
            if !props.is_empty() {
                builder.push_metadata_block(props, None);
            }
            continue;
        }

        if trimmed.starts_with('*') {
            let stars = trimmed.bytes().take_while(|&b| b == b'*').count();
            if trimmed.as_bytes().get(stars) == Some(&b' ') {
                let raw_heading = trimmed[stars + 1..].trim();
                if !raw_heading.is_empty() {
                    // Count in `usize` and saturate; a `u8` counter overflows at 256 stars.
                    let level = u8::try_from(stars).unwrap_or(u8::MAX);
                    let todo_keywords = ["TODO", "DONE", "NEXT", "WAITING", "CANCELLED", "CANCELED"];
                    let mut heading_text = raw_heading;
                    let mut todo_keyword: Option<&str> = None;
                    let mut tags: Option<String> = None;

                    for kw in &todo_keywords {
                        if let Some(after) = heading_text.strip_prefix(*kw)
                            && (after.is_empty() || after.starts_with(' '))
                        {
                            todo_keyword = Some(*kw);
                            heading_text = after.trim_start();
                            break;
                        }
                    }

                    if let Some(tag_start) = heading_text.rfind(" :") {
                        let potential_tags = &heading_text[tag_start + 1..];
                        if potential_tags.ends_with(':')
                            && potential_tags.len() > 2
                            && potential_tags[1..potential_tags.len() - 1]
                                .chars()
                                .all(|c| c.is_alphanumeric() || c == ':' || c == '_' || c == '@')
                        {
                            tags = Some(potential_tags.to_string());
                            heading_text = heading_text[..tag_start].trim_end();
                        }
                    }

                    let heading_idx = builder.push_heading(level, heading_text, None, None);
                    if todo_keyword.is_some() || tags.is_some() {
                        let mut attrs = AHashMap::new();
                        if let Some(kw) = todo_keyword {
                            attrs.insert("todo".to_string(), kw.to_string());
                        }
                        if let Some(t) = tags {
                            attrs.insert("tags".to_string(), t);
                        }
                        builder.set_attributes(heading_idx, attrs);
                    }
                }
                i += 1;
                continue;
            }
        }

        if has_prefix_ignore_case(trimmed, "#+BEGIN_SRC") {
            let language: Option<&str> = trimmed.split_whitespace().nth(1);
            i += 1;
            let code_content = take_block_body(&lines, &mut i, "#+END_SRC");
            builder.push_code(code_content.trim_end(), language, None);
            continue;
        }

        if has_prefix_ignore_case(trimmed, "#+BEGIN_QUOTE") {
            builder.push_quote(None);
            i += 1;
            while i < lines.len() {
                let t = lines[i].trim();
                i += 1;
                if has_prefix_ignore_case(t, "#+END_QUOTE") {
                    break;
                }
                if !t.is_empty() {
                    builder.push_paragraph(t, vec![], None, None);
                }
            }
            builder.exit_container();
            continue;
        }

        if has_prefix_ignore_case(trimmed, "#+BEGIN_") {
            // The block type is whatever follows `#+BEGIN_` in the first token.
            let opener = trimmed.split_whitespace().next().unwrap_or("");
            let block_type = opener.get("#+BEGIN_".len()..).unwrap_or("UNKNOWN");
            let end_marker = format!("#+END_{}", block_type);
            i += 1;
            let block_content = take_block_body(&lines, &mut i, &end_marker);
            builder.push_raw_block("orgmode", block_content.trim_end(), None);
            continue;
        }

        if trimmed.starts_with('|') && trimmed.ends_with('|') {
            let mut table_cells: Vec<Vec<String>> = Vec::new();
            while i < lines.len() {
                let t = lines[i].trim();
                if !t.starts_with('|') || !t.ends_with('|') {
                    break;
                }
                i += 1;
                // Rows starting with `|-` are horizontal rules.
                if t.starts_with("|-") {
                    continue;
                }
                let inner = t.strip_prefix('|').unwrap_or(t);
                let inner = inner.strip_suffix('|').unwrap_or(inner);
                let cells: Vec<String> = inner.split('|').map(|cell| cell.trim().to_string()).collect();
                if cells.iter().any(|cell| !cell.is_empty()) {
                    table_cells.push(cells);
                }
            }
            if !table_cells.is_empty() {
                builder.push_table_from_cells(&table_cells, None);
            }
            continue;
        }

        if !trimmed.is_empty() && Self::is_org_list_item(trimmed) {
            let is_ordered = Self::is_org_ordered_item(trimmed);
            let list_idx = builder.push_list(is_ordered, None);
            while i < lines.len() {
                let t = lines[i].trim();
                if t.is_empty() || !Self::is_org_list_item(t) {
                    break;
                }
                let text = Self::strip_list_prefix(t);
                let (item_text, checkbox_state) = if let Some(rest) = text.strip_prefix("[ ] ") {
                    (rest, Some("unchecked"))
                } else if let Some(rest) = text.strip_prefix("[x] ").or_else(|| text.strip_prefix("[X] ")) {
                    (rest, Some("checked"))
                } else {
                    (text, None)
                };
                let item_idx = builder.push_list_item(list_idx, item_text, None);
                if let Some(state) = checkbox_state {
                    let mut attrs = AHashMap::new();
                    attrs.insert("checkbox".to_string(), state.to_string());
                    builder.set_attributes(item_idx, attrs);
                }
                i += 1;
            }
            continue;
        }

        if !trimmed.is_empty() {
            let footnote_refs = Self::find_footnote_references(trimmed);
            let (stripped, annotations) = Self::parse_inline_markup(trimmed);
            builder.push_paragraph(&stripped, annotations, None, None);
            for fref in footnote_refs {
                builder.push_footnote(&format!("[fn:{}]", fref), None);
            }
        }
        i += 1;
    }

    builder.build()
}
/// Returns `true` when `line`, ignoring leading whitespace, starts a list item.
///
/// Two forms are recognised:
/// - unordered: a `-` or `+` bullet followed by a space;
/// - ordered: one to three ASCII digits, then `.` or `)`, then a space (for example
///   `1. ` or `12) `).
///
/// An ordered marker must contain at least one digit, so `". text"` and `") text"`
/// are not list items. Only ASCII digits count, not other Unicode numerics such as `²`.
fn is_org_list_item(line: &str) -> bool {
    let t = line.trim_start();
    if t.starts_with("- ") || t.starts_with("+ ") {
        return true;
    }

    t.split_once(' ').map_or(false, |(marker, _)| {
        // The marker (digits plus terminator) is limited to four bytes.
        marker.len() < 5
            && marker
                .strip_suffix('.')
                .or_else(|| marker.strip_suffix(')'))
                .map_or(false, |digits| {
                    !digits.is_empty() && digits.bytes().all(|b| b.is_ascii_digit())
                })
    })
}
/// Returns `true` when `line`, ignoring leading whitespace, starts an ordered list item.
///
/// An ordered marker is one to three ASCII digits, then `.` or `)`, then a space
/// (for example `1. ` or `12) `).
///
/// The marker must contain at least one digit, so `". text"` and `") text"` are
/// rejected. Only ASCII digits count, not other Unicode numerics such as `²`.
fn is_org_ordered_item(line: &str) -> bool {
    let t = line.trim_start();

    t.split_once(' ').map_or(false, |(marker, _)| {
        // The marker (digits plus terminator) is limited to four bytes.
        marker.len() < 5
            && marker
                .strip_suffix('.')
                .or_else(|| marker.strip_suffix(')'))
                .map_or(false, |digits| {
                    !digits.is_empty() && digits.bytes().all(|b| b.is_ascii_digit())
                })
    })
}
/// Removes the list marker from `line` and returns the item text.
///
/// Leading whitespace is ignored. A `- ` or `+ ` bullet is stripped if present.
/// Otherwise everything up to and including the first space is dropped, which
/// removes markers such as `1.` or `2)`. A line with no space is returned unchanged
/// apart from the leading trim.
fn strip_list_prefix(line: &str) -> &str {
    let body = line.trim_start();
    ["- ", "+ "]
        .iter()
        .find_map(|bullet| body.strip_prefix(*bullet))
        .or_else(|| body.split_once(' ').map(|(_, tail)| tail))
        .unwrap_or(body)
}
/// Renders a grid of cells as a Markdown pipe table.
///
/// Each row becomes `| a | b |` followed by a newline. When there is more than one
/// row, a `| --- |` rule with one segment per cell of the first row is inserted
/// after the first row. An empty grid yields an empty string.
fn cells_to_markdown(cells: &[Vec<String>]) -> String {
    // Appends one `| cell | cell |` line to `out`.
    fn push_row(out: &mut String, row: &[String]) {
        out.push('|');
        for cell in row {
            out.push(' ');
            out.push_str(cell);
            out.push_str(" |");
        }
        out.push('\n');
    }

    let (header, body) = match cells.split_first() {
        Some(parts) => parts,
        None => return String::new(),
    };

    let mut md = String::new();
    push_row(&mut md, header);
    if !body.is_empty() {
        md.push('|');
        md.push_str(&" --- |".repeat(header.len()));
        md.push('\n');
    }
    for row in body {
        push_row(&mut md, row);
    }
    md
}
}
#[cfg(feature = "office")]
impl Default for OrgModeExtractor {
    /// Equivalent to [`OrgModeExtractor::new`].
    fn default() -> Self {
        OrgModeExtractor::new()
    }
}
#[cfg(feature = "office")]
impl Plugin for OrgModeExtractor {
    /// Identifier of this plugin.
    fn name(&self) -> &str {
        "orgmode-extractor"
    }

    /// One-line description of what the plugin does.
    fn description(&self) -> &str {
        "Native Rust extractor for Org Mode documents with comprehensive metadata extraction"
    }

    /// Author string reported for this plugin.
    fn author(&self) -> &str {
        "Kreuzberg Team"
    }

    /// Version of the crate this plugin was compiled from.
    fn version(&self) -> String {
        String::from(env!("CARGO_PKG_VERSION"))
    }

    /// Nothing to set up; always succeeds.
    fn initialize(&self) -> Result<()> {
        Ok(())
    }

    /// Nothing to tear down; always succeeds.
    fn shutdown(&self) -> Result<()> {
        Ok(())
    }
}
#[cfg(feature = "office")]
#[cfg_attr(not(target_arch = "wasm32"), async_trait)]
#[cfg_attr(target_arch = "wasm32", async_trait(?Send))]
impl DocumentExtractor for OrgModeExtractor {
    /// Extracts text, metadata and tables from an Org Mode document.
    ///
    /// `content` is decoded as UTF-8, with invalid sequences replaced, and parsed
    /// with [`Org::from_vec`]. A parse failure is returned as the error. The block-level
    /// document structure is built only when `config.include_document_structure` is set.
    /// `mime_type` is echoed back in the result unchanged.
    #[cfg_attr(
        feature = "otel",
        tracing::instrument(
            skip(self, content, config),
            fields(
                extractor.name = self.name(),
                content.size_bytes = content.len(),
            )
        )
    )]
    async fn extract_bytes(
        &self,
        content: &[u8],
        mime_type: &str,
        config: &ExtractionConfig,
    ) -> Result<ExtractionResult> {
        let org_text = String::from_utf8_lossy(content).into_owned();
        let source_lines: Vec<String> = org_text.lines().map(str::to_owned).collect();
        let org = Org::from_vec(&source_lines)?;

        let (metadata, body_text) = Self::extract_metadata_and_content(&org_text, &org);
        let tables = Self::extract_tables(&org);
        let document = config
            .include_document_structure
            .then(|| Self::build_document_structure(&org_text));

        Ok(ExtractionResult {
            content: body_text,
            mime_type: mime_type.to_string().into(),
            metadata,
            tables,
            detected_languages: None,
            chunks: None,
            images: None,
            djot_content: None,
            pages: None,
            elements: None,
            ocr_elements: None,
            document,
            #[cfg(any(feature = "keywords-yake", feature = "keywords-rake"))]
            extracted_keywords: None,
            quality_score: None,
            processing_warnings: Vec::new(),
            annotations: None,
            children: None,
        })
    }

    /// MIME types this extractor handles.
    fn supported_mime_types(&self) -> &[&str] {
        &["text/x-org", "text/org", "application/x-org"]
    }

    /// Priority value reported for this extractor.
    fn priority(&self) -> i32 {
        50
    }
}
#[cfg(all(test, feature = "office"))]
mod tests {
    use super::*;

    /// Parses `org_text` into an [`Org`] tree the same way `extract_bytes` does.
    fn parse(org_text: &str) -> Org {
        let lines: Vec<String> = org_text.lines().map(|s| s.to_string()).collect();
        Org::from_vec(&lines).expect("Failed to parse org")
    }

    #[test]
    fn test_orgmode_extractor_plugin_interface() {
        let extractor = OrgModeExtractor::new();
        assert_eq!(extractor.name(), "orgmode-extractor");
        assert_eq!(extractor.version(), env!("CARGO_PKG_VERSION"));
        assert_eq!(extractor.priority(), 50);
        assert!(!extractor.supported_mime_types().is_empty());
    }

    #[test]
    fn test_orgmode_extractor_supports_text_x_org() {
        let extractor = OrgModeExtractor::new();
        assert!(extractor.supported_mime_types().contains(&"text/x-org"));
    }

    #[test]
    fn test_orgmode_extractor_default() {
        // Go through the `Default` impl; the unit-struct literal would bypass it.
        let extractor = OrgModeExtractor::default();
        assert_eq!(extractor.name(), "orgmode-extractor");
    }

    #[test]
    fn test_orgmode_extractor_initialize_shutdown() {
        let extractor = OrgModeExtractor::new();
        assert!(extractor.initialize().is_ok());
        assert!(extractor.shutdown().is_ok());
    }

    #[test]
    fn test_extract_metadata_with_title() {
        let org_text = "#+TITLE: Test Document\n\nContent here.";
        let org = parse(org_text);
        let (metadata, _) = OrgModeExtractor::extract_metadata_and_content(org_text, &org);
        assert!(metadata.additional.get("title").and_then(|v| v.as_str()).is_some());
    }

    #[test]
    fn test_extract_metadata_with_author() {
        let org_text = "#+AUTHOR: John Doe\n\nContent here.";
        let org = parse(org_text);
        let (metadata, _) = OrgModeExtractor::extract_metadata_and_content(org_text, &org);
        assert!(metadata.additional.get("author").and_then(|v| v.as_str()).is_some());
    }

    #[test]
    fn test_extract_metadata_with_date() {
        let org_text = "#+DATE: 2024-01-15\n\nContent here.";
        let org = parse(org_text);
        let (metadata, _) = OrgModeExtractor::extract_metadata_and_content(org_text, &org);
        assert_eq!(metadata.created_at, Some("2024-01-15".to_string()));
    }

    #[test]
    fn test_extract_metadata_with_keywords() {
        let org_text = "#+KEYWORDS: rust, org-mode, parsing\n\nContent here.";
        let org = parse(org_text);
        let (metadata, _) = OrgModeExtractor::extract_metadata_and_content(org_text, &org);
        let keywords = metadata.additional.get("keywords").and_then(|v| v.as_array());
        assert!(keywords.is_some());
    }

    #[test]
    fn test_extract_content_with_headings() {
        let org = parse("* Heading 1\n\nSome content.\n\n** Heading 2\n\nMore content.");
        let content = OrgModeExtractor::extract_content(&org);
        assert!(content.contains("Heading 1"));
        assert!(content.contains("Heading 2"));
        assert!(content.contains("Some content"));
        assert!(content.contains("More content"));
    }

    #[test]
    fn test_extract_content_with_paragraphs() {
        let org = parse("First paragraph.\n\nSecond paragraph.");
        let content = OrgModeExtractor::extract_content(&org);
        assert!(content.contains("First paragraph"));
        assert!(content.contains("Second paragraph"));
    }

    #[test]
    fn test_extract_content_with_lists() {
        let org = parse("- Item 1\n- Item 2\n- Item 3");
        let content = OrgModeExtractor::extract_content(&org);
        assert!(content.contains("Item 1"));
        assert!(content.contains("Item 2"));
        assert!(content.contains("Item 3"));
    }

    #[test]
    fn test_cells_to_markdown_format() {
        let cells = vec![
            vec!["Name".to_string(), "Age".to_string()],
            vec!["Alice".to_string(), "30".to_string()],
            vec!["Bob".to_string(), "25".to_string()],
        ];
        let markdown = OrgModeExtractor::cells_to_markdown(&cells);
        assert!(markdown.contains("Name"));
        assert!(markdown.contains("Age"));
        assert!(markdown.contains("Alice"));
        assert!(markdown.contains("Bob"));
        assert!(markdown.contains("---"));
    }

    #[test]
    fn test_orgmode_extractor_supported_mime_types() {
        let extractor = OrgModeExtractor::new();
        let supported = extractor.supported_mime_types();
        assert!(supported.contains(&"text/x-org"));
    }

    #[test]
    fn test_link_with_description() {
        let org = parse("* Links Test\n[[http://att.com/][AT&T]]\n");
        let content = OrgModeExtractor::extract_content(&org);
        assert!(content.contains("AT&T"), "Should contain link description 'AT&T'");
    }

    #[test]
    fn test_link_without_description() {
        let org = parse("* Links Test\n[[https://example.com]]\n");
        let content = OrgModeExtractor::extract_content(&org);
        assert!(
            content.contains("example.com"),
            "Should contain link path when no description provided"
        );
    }

    #[test]
    fn test_link_with_ampersand_in_description() {
        let org = parse("* Company Links\n[[http://att.com/][AT&T Company]]\n");
        let content = OrgModeExtractor::extract_content(&org);
        assert!(
            content.contains("AT&T"),
            "Should preserve ampersand in link description"
        );
    }

    #[test]
    fn test_multiple_links_with_mixed_descriptions() {
        let org = parse(
            "* Multiple Links\n[[https://example.com][Example Link]]\n[[https://example.org]]\n[[mailto:test@example.com][Contact]]\n",
        );
        let content = OrgModeExtractor::extract_content(&org);
        assert!(content.contains("Example Link"));
        assert!(content.contains("example.org"));
        assert!(content.contains("Contact"));
    }

    #[test]
    fn test_link_description_priority_over_url() {
        let org = parse(r#"[[http://att.com/][AT&T]]"#);
        let content = OrgModeExtractor::extract_content(&org);
        assert!(content.contains("AT&T"), "Description should be prioritized over URL");
        assert!(
            content.contains("[AT&T]"),
            "Link should be formatted as [description] when description exists"
        );
    }

    #[test]
    fn test_emoji_and_cjk_with_inline_markup() {
        // Real 4-byte (emoji) and 3-byte (CJK) characters, so offsets that are not
        // char boundaries would be caught.
        let (text, annotations) = OrgModeExtractor::parse_inline_markup("🎉 *太字* テスト");
        assert!(text.contains("🎉"), "Emoji preserved");
        assert!(text.contains("太字"), "Bold content present");
        assert!(text.contains("テスト"), "Trailing CJK preserved");
        assert!(!annotations.is_empty(), "Should have bold annotation");
        assert_eq!(text, "🎉 太字 テスト", "Only the emphasis markers are removed");
    }

    #[test]
    fn test_cjk_heading_with_markup() {
        let org = parse("* 見出し\n\n🎉 *太字* テスト");
        let content = OrgModeExtractor::extract_content(&org);
        assert!(content.contains("見出し"), "CJK heading preserved");
        assert!(content.contains("太字"), "Bold CJK text present");
    }
}