1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
//! Generic lexer framework for tokenizing input streams
//!
//! This module provides the core tokenization infrastructure that can be
//! specialized for different RDF formats.
use crate::error::{TextPosition, TokenRecognizerError};
use std::io::BufRead;
/// A token recognizer that converts byte streams into tokens
///
/// Implementors encapsulate the format-specific scanning logic; the
/// surrounding tokenizer feeds them raw byte slices and owns all buffering.
pub trait TokenRecognizer {
/// The token type produced by this recognizer
///
/// The lifetime parameter allows tokens to borrow from the input slice
/// passed to [`TokenRecognizer::recognize_next_token`].
type Token<'a>;
/// Configuration options for tokenization
type Options: Default;
/// Recognize the next token from input data
///
/// `is_ending` signals that `data` is the final chunk of input, so the
/// recognizer must not return `None` hoping for more bytes.
///
/// Returns:
/// - `Some((consumed_bytes, Ok(token)))` for successful recognition
/// - `Some((consumed_bytes, Err(error)))` for tokenization errors
/// - `None` if more data is needed
fn recognize_next_token<'a>(
&mut self,
data: &'a [u8],
is_ending: bool,
options: &Self::Options,
) -> Option<(usize, Result<Self::Token<'a>, TokenRecognizerError>)>;
/// Reset the recognizer state
///
/// The default implementation is a no-op, suitable for stateless recognizers.
fn reset(&mut self) {}
}
/// A token or a line jump in the input
///
/// Line jumps are surfaced as their own variant so callers of line-oriented
/// formats can track line boundaries without re-scanning the input.
#[derive(Debug, Clone, PartialEq)]
pub enum TokenOrLineJump<T> {
/// A regular token
Token(T),
/// A line jump (newline character)
LineJump,
}
/// A streaming tokenizer that processes input incrementally
pub struct StreamingTokenizer<R, T: TokenRecognizer> {
// Format-specific scanner that turns buffered bytes into tokens.
recognizer: T,
// Underlying input source; bytes are pulled from it on demand.
reader: R,
// Bytes read from `reader` but not yet consumed by the recognizer.
buffer: Vec<u8>,
// Position in the input corresponding to the start of `buffer`.
position: TextPosition,
// Recognizer configuration, fixed at construction time.
options: T::Options,
// Size of each read request issued to `reader` (see `Iterator::next`).
buffer_capacity: usize,
}
impl<R: BufRead, T: TokenRecognizer> StreamingTokenizer<R, T> {
    /// Create a new streaming tokenizer using the recognizer's default options
    pub fn new(recognizer: T, reader: R) -> Self {
        // Delegate to the general constructor with default options.
        Self::with_options(recognizer, reader, T::Options::default())
    }
    /// Create a new streaming tokenizer with custom options
    pub fn with_options(recognizer: T, reader: R, options: T::Options) -> Self {
        Self {
            recognizer,
            reader,
            buffer: Vec::new(),
            position: TextPosition::start(),
            options,
            buffer_capacity: 8192, // 8KB default buffer
        }
    }
    /// Set the buffer capacity
    ///
    /// Builder-style: consumes and returns `self`.
    pub fn with_buffer_capacity(mut self, capacity: usize) -> Self {
        self.buffer_capacity = capacity;
        self
    }
    /// Get the current position in the input
    pub fn position(&self) -> TextPosition {
        self.position
    }
}
impl<R: BufRead, T: TokenRecognizer> Iterator for StreamingTokenizer<R, T> {
// NOTE(review): the `'static` lifetime on the token is obtained via
// `mem::transmute` below. This is unsound if `T::Token<'a>` actually
// borrows from the input slice, because the backing `buffer_data` clone
// is dropped at the end of each `next()` call — the transmuted token
// would dangle. It can only be correct for token types that own their
// data; confirm that, or replace the transmute with an explicit
// borrowed→owned conversion bound on `T::Token`.
type Item = Result<TokenOrLineJump<T::Token<'static>>, TokenRecognizerError>;
/// Pull bytes from the reader until the recognizer yields a token or an
/// error, returning `None` only at true end of input.
fn next(&mut self) -> Option<Self::Item> {
loop {
// Clone buffer data to avoid borrowing issues
// (the recognizer borrows the slice immutably while we still need
// `&mut self.buffer` afterwards to drain the consumed bytes;
// NOTE(review): this clones the whole buffer on every call).
let buffer_data = self.buffer.clone();
// Try to recognize a token from current buffer
let recognition_result = self.recognizer.recognize_next_token(
&buffer_data,
false, // We'll handle EOF separately
&self.options,
);
if let Some((consumed, result)) = recognition_result {
// Update position and consume bytes
self.position.advance_bytes(&self.buffer[..consumed]);
self.buffer.drain(..consumed);
return Some(result.map(|token| {
// Convert token to owned version
// This is a simplification - in practice we'd need trait bounds
// for converting borrowed tokens to owned tokens
// (see the NOTE on `type Item` above: unsound for borrowing tokens).
unsafe { std::mem::transmute(TokenOrLineJump::Token(token)) }
}));
}
// Need more data - read from input
let mut chunk = vec![0u8; self.buffer_capacity];
match self.reader.read(&mut chunk) {
Ok(0) => {
// EOF - try to recognize with ending flag
// so the recognizer can emit a final token from leftover bytes.
if !self.buffer.is_empty() {
let buffer_data = self.buffer.clone();
let eof_recognition_result =
self.recognizer
.recognize_next_token(&buffer_data, true, &self.options);
if let Some((consumed, result)) = eof_recognition_result {
self.position.advance_bytes(&self.buffer[..consumed]);
self.buffer.drain(..consumed);
return Some(result.map(|token| unsafe {
std::mem::transmute(TokenOrLineJump::Token(token))
}));
}
// NOTE(review): if the recognizer returns `None` even with
// `is_ending = true`, leftover bytes are silently dropped here.
}
return None; // True EOF
}
Ok(n) => {
// Append only the bytes actually read, then retry recognition.
chunk.truncate(n);
self.buffer.extend_from_slice(&chunk);
}
Err(e) => {
// NOTE(review): an I/O error is shoehorned into
// `UnexpectedCharacter` using the first char of its message —
// lossy; an `Io`-style error variant would be more faithful.
return Some(Err(TokenRecognizerError::UnexpectedCharacter(
e.to_string().chars().next().unwrap_or('?'),
)));
}
}
}
}
}
/// A simple tokenizer for line-based formats (N-Triples, N-Quads)
pub struct LineTokenizer<R> {
// Underlying input source, consumed one line at a time.
reader: R,
// Position in the input, advanced over every character read so far.
position: TextPosition,
}
impl<R: BufRead> LineTokenizer<R> {
    /// Create a new line tokenizer reading from `reader`
    pub fn new(reader: R) -> Self {
        let position = TextPosition::start();
        Self { reader, position }
    }
    /// Get the current position
    pub fn position(&self) -> TextPosition {
        self.position
    }
}
impl<R: BufRead> Iterator for LineTokenizer<R> {
    type Item = Result<String, std::io::Error>;
    /// Yield the next line with its trailing `\n` / `\r\n` removed.
    ///
    /// The tracked position is advanced over the full raw line, including
    /// the line terminator, before the terminator is stripped.
    fn next(&mut self) -> Option<Self::Item> {
        let mut line = String::new();
        match self.reader.read_line(&mut line) {
            Ok(0) => None, // end of input
            Ok(_) => {
                // Advance over every character of the raw line (terminator included).
                line.chars().for_each(|ch| self.position.advance_char(ch));
                // Strip a trailing "\n", and a preceding "\r" if present (CRLF).
                if let Some(rest) = line.strip_suffix('\n') {
                    let trimmed_len = rest.strip_suffix('\r').map_or(rest.len(), str::len);
                    line.truncate(trimmed_len);
                }
                Some(Ok(line))
            }
            Err(e) => Some(Err(e)),
        }
    }
}