use super::{Parser, PlainEnglish};
use crate::{Span, Token, TokenKind};
/// Marks whether a source-block delimiter line opens (`#+BEGIN_SRC`) or
/// closes (`#+END_SRC`) a code block.
#[derive(Debug, PartialEq, Copy, Clone)]
enum SourceBlockMarker {
    Begin,
    End,
}

/// Returns `true` if the line starting at `start` begins with `*`,
/// i.e. looks like an Org headline.
fn is_header_line(chars: &[char], start: usize) -> bool {
    chars.get(start).is_some_and(|c| *c == '*')
}

/// Detects a source-block delimiter on the line beginning at `start`.
///
/// Org block keywords are case-insensitive, so any casing of
/// `#+BEGIN_SRC` / `#+END_SRC` (e.g. `#+Begin_src`) is recognized.
/// Leading/trailing whitespace on the line is ignored.
fn is_source_block_marker(chars: &[char], start: usize) -> Option<SourceBlockMarker> {
    // Collect the current line (up to, not including, the next newline) and
    // uppercase it so a single prefix test covers every keyword casing.
    let line: String = chars[start..]
        .iter()
        .take_while(|&&c| c != '\n')
        .collect();
    let line = line.trim().to_ascii_uppercase();
    if line.starts_with("#+BEGIN_SRC") {
        Some(SourceBlockMarker::Begin)
    } else if line.starts_with("#+END_SRC") {
        Some(SourceBlockMarker::End)
    } else {
        None
    }
}
/// Reports whether the line at `start` opens with the Org directive
/// prefix `#+` (e.g. `#+TITLE:`).
fn is_directive(chars: &[char], start: usize) -> bool {
    // Checked indexing handles lines shorter than two characters.
    chars.get(start) == Some(&'#') && chars.get(start + 1) == Some(&'+')
}
/// Reports whether the line starting at `start` is an Org plain-list item.
///
/// After optional leading spaces/tabs, the following bullets are accepted:
/// - unordered: `- `, `+ `, and `* ` — but `*` only when preceded by
///   indentation, since an unindented `*` introduces a headline instead;
/// - ordered: one or more ASCII digits followed by `.` or `)` and a space.
fn is_list_item(chars: &[char], start: usize) -> bool {
    // Skip leading indentation.
    let mut pos = start;
    while pos < chars.len() && (chars[pos] == ' ' || chars[pos] == '\t') {
        pos += 1;
    }
    if pos >= chars.len() {
        return false;
    }
    // Unordered bullet; `*` requires indentation to disambiguate from headlines.
    let bullet = chars[pos] == '-' || chars[pos] == '+' || (chars[pos] == '*' && pos > start);
    if bullet && pos + 1 < chars.len() && chars[pos + 1] == ' ' {
        return true;
    }
    // Ordered bullet: digits, then `.` or `)`, then a space.
    if chars[pos].is_ascii_digit() {
        let mut num_end = pos;
        while num_end < chars.len() && chars[num_end].is_ascii_digit() {
            num_end += 1;
        }
        if num_end + 1 < chars.len()
            && (chars[num_end] == '.' || chars[num_end] == ')')
            && chars[num_end + 1] == ' '
        {
            return true;
        }
    }
    false
}
/// Copies `chars`, replacing only the *first* tab encountered with a single
/// space; every other character — including any later tabs — is kept
/// verbatim. The one-for-one substitution keeps the line length unchanged,
/// so token spans computed on the normalized line still map back onto the
/// original source.
fn normalize_list_item_whitespace(chars: &[char]) -> Vec<char> {
    let mut replaced = false;
    chars
        .iter()
        .map(|&c| {
            if c == '\t' && !replaced {
                replaced = true;
                ' '
            } else {
                c
            }
        })
        .collect()
}
/// Returns the slice of `chars` from `start` up to (not including) the next
/// newline, or up to the end of input when no newline follows.
fn get_line_from_start(chars: &[char], start: usize) -> &[char] {
    let len = chars[start..]
        .iter()
        .position(|&c| c == '\n')
        .unwrap_or(chars.len() - start);
    &chars[start..start + len]
}
/// Returns the index one past the newline terminating the line that begins
/// at `start`, or `chars.len()` when the final line is unterminated.
fn find_line_end(chars: &[char], start: usize) -> usize {
    let mut pos = start;
    loop {
        match chars.get(pos) {
            // Include the newline itself in the line.
            Some(&'\n') => return pos + 1,
            Some(_) => pos += 1,
            None => return pos,
        }
    }
}
/// Returns the index of the first character of the line containing `pos`:
/// the position just after the previous newline, or 0 at input start.
fn find_line_start(chars: &[char], pos: usize) -> usize {
    chars[..pos]
        .iter()
        .rposition(|&c| c == '\n')
        .map_or(0, |nl| nl + 1)
}
/// A `Parser` implementation for Emacs Org-mode documents.
///
/// Headlines, `#+` directives, source blocks, and list items receive special
/// treatment; everything else is delegated to the plain-English parser.
// The previous empty `impl OrgMode {}` block was dead code and has been removed.
#[derive(Default, Clone, Debug, Copy)]
pub struct OrgMode;
impl Parser for OrgMode {
    /// Tokenizes an Org-mode document.
    ///
    /// Per line: source-block marker lines and everything between them, plus
    /// `#+` directive lines, are emitted as single `Unlintable` tokens;
    /// headline text (after the leading `*`s) and list items are handed to
    /// the `PlainEnglish` parser; all remaining lines are English-parsed
    /// verbatim. Spans produced by the inner parser are shifted back into
    /// `source` coordinates before being collected.
    fn parse(&self, source: &[char]) -> Vec<Token> {
        let english_parser = PlainEnglish;
        let mut tokens = Vec::new();
        let mut cursor = 0;
        // True while between a BEGIN_SRC marker and the matching END_SRC.
        let mut in_source_block = false;
        while cursor < source.len() {
            // `cursor` is advanced to `line_end` on every path below, so it
            // always sits at a line start; `find_line_start` re-anchors it.
            let line_start = find_line_start(source, cursor);
            let source_marker = is_source_block_marker(source, line_start);
            if let Some(marker) = source_marker {
                in_source_block = marker == SourceBlockMarker::Begin;
            }
            // Source-block contents and both marker lines are unlintable.
            // The `is_some()` test keeps the END_SRC line itself covered even
            // though `in_source_block` was just cleared above.
            if in_source_block || source_marker.is_some() {
                let line_end = find_line_end(source, line_start);
                tokens.push(Token {
                    span: Span::new(line_start, line_end),
                    kind: TokenKind::Unlintable,
                });
                cursor = line_end;
                continue;
            }
            if is_header_line(source, line_start) {
                let line_end = find_line_end(source, line_start);
                // Skip the leading `*`s and any interleaved spaces so only
                // the headline text is linted.
                let mut header_text_start = line_start;
                while header_text_start < line_end
                    && (source[header_text_start] == '*' || source[header_text_start] == ' ')
                {
                    header_text_start += 1;
                }
                if header_text_start < line_end {
                    let mut header_tokens =
                        english_parser.parse(&source[header_text_start..line_end]);
                    // Re-anchor the sub-parse's spans onto `source`.
                    header_tokens
                        .iter_mut()
                        .for_each(|token| token.span.push_by(header_text_start));
                    tokens.append(&mut header_tokens);
                }
                // A headline always terminates a paragraph; the break carries
                // an empty span at the end of the headline's line.
                tokens.push(Token {
                    span: Span::empty(line_end.saturating_sub(1)),
                    kind: TokenKind::ParagraphBreak,
                });
                cursor = line_end;
                continue;
            }
            // `#+` directives (e.g. `#+TITLE:`) are metadata, not prose.
            if is_directive(source, line_start) {
                let line_end = find_line_end(source, line_start);
                tokens.push(Token {
                    span: Span::new(line_start, line_end),
                    kind: TokenKind::Unlintable,
                });
                cursor = line_end;
                continue;
            }
            if is_list_item(source, line_start) {
                let line_end = find_line_end(source, line_start);
                let line_chars = &source[line_start..line_end];
                // The first tab becomes a space (same length), so spans
                // computed on the normalized line still align with `source`.
                let normalized_chars = normalize_list_item_whitespace(line_chars);
                let mut line_tokens = english_parser.parse(&normalized_chars);
                line_tokens
                    .iter_mut()
                    .for_each(|token| token.span.push_by(line_start));
                tokens.append(&mut line_tokens);
                cursor = line_end;
                continue;
            }
            // Plain text: lint the rest of the current line as-is.
            let line_end = find_line_end(source, cursor);
            if cursor < line_end {
                let mut line_tokens = english_parser.parse(&source[cursor..line_end]);
                line_tokens
                    .iter_mut()
                    .for_each(|token| token.span.push_by(cursor));
                tokens.append(&mut line_tokens);
            }
            cursor = line_end;
        }
        // Drop a trailing newline/paragraph-break token that has no matching
        // newline character in the input (e.g. the break appended after a
        // final headline on an unterminated last line).
        if matches!(
            tokens.last(),
            Some(Token {
                kind: TokenKind::Newline(_) | TokenKind::ParagraphBreak,
                ..
            })
        ) && source.last() != Some(&'\n')
        {
            tokens.pop();
        }
        tokens
    }
}
#[cfg(test)]
mod tests {
    use super::super::StrParser;
    use super::OrgMode;
    use crate::TokenKind;

    /// Number of `Unlintable` tokens in `tokens`.
    fn unlintable(tokens: &[crate::Token]) -> usize {
        tokens
            .iter()
            .filter(|t| matches!(t.kind, TokenKind::Unlintable))
            .count()
    }

    /// Number of `Word` tokens in `tokens`.
    fn words(tokens: &[crate::Token]) -> usize {
        tokens
            .iter()
            .filter(|t| matches!(t.kind, TokenKind::Word(_)))
            .count()
    }

    /// True when at least one token is a word.
    fn has_word(tokens: &[crate::Token]) -> bool {
        tokens.iter().any(|t| matches!(t.kind, TokenKind::Word(_)))
    }

    #[test]
    fn simple_text() {
        let parsed = OrgMode.parse_str("This is simple text.");
        assert!(!parsed.is_empty());
        assert!(has_word(&parsed));
    }

    #[test]
    fn header_parsing() {
        let parsed = OrgMode.parse_str("* This is a header\nThis is regular text.");
        assert!(has_word(&parsed));
        assert!(
            parsed
                .iter()
                .any(|t| matches!(t.kind, TokenKind::ParagraphBreak))
        );
    }

    #[test]
    fn multiple_level_headers() {
        let parsed = OrgMode.parse_str("** Second level header\n*** Third level header");
        assert!(words(&parsed) >= 4);
    }

    #[test]
    fn source_block_unlintable() {
        let source = r#"Regular text.
#+BEGIN_SRC rust
fn main() {
println!("Hello, world!");
}
#+END_SRC
More regular text."#;
        let parsed = OrgMode.parse_str(source);
        assert!(unlintable(&parsed) > 0);
        assert!(has_word(&parsed));
    }

    #[test]
    fn directive_unlintable() {
        let source = r#"#+TITLE: My Document
#+AUTHOR: Test Author
This is regular text."#;
        let parsed = OrgMode.parse_str(source);
        assert_eq!(unlintable(&parsed), 2);
        assert!(has_word(&parsed));
    }

    #[test]
    fn case_insensitive_source_blocks() {
        let source = r#"#+begin_src python
print("hello")
#+end_src"#;
        // Marker lines plus the one content line: three unlintable tokens.
        assert_eq!(unlintable(&OrgMode.parse_str(source)), 3);
    }

    #[test]
    fn empty_header() {
        assert!(has_word(&OrgMode.parse_str("*\nRegular text.")));
    }

    #[test]
    fn no_trailing_newline() {
        let parsed = OrgMode.parse_str("Simple text without newline");
        assert!(!parsed.last().unwrap().kind.is_newline());
    }

    #[test]
    fn list_items_with_tabs() {
        let source = "- First item\n\t- Indented with tab\n+ Second item\n1. Numbered item";
        let parsed = OrgMode.parse_str(source);
        assert!(has_word(&parsed));
        assert_eq!(unlintable(&parsed), 0);
    }

    #[test]
    fn mixed_list_formats() {
        let source = r#"- Bullet item
1. Numbered item
+ Plus item
2) Parenthesis numbered"#;
        let parsed = OrgMode.parse_str(source);
        assert!(words(&parsed) == 8, "{:?}", parsed);
    }
}