markhor_core 0.1.0-alpha.0.2

Core library for Markhor, a project connecting AI models, documents, and workflows for knowledge work
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
use std::sync::Arc;

use crate::{chat::{chat::ChatApi, prompter::Prompter, ChatError}, chunking::Chunker, convert::{ConversionError, Converter}, embedding::{Embedder, EmbeddingError}, extension::{ActiveExtension, Extension, F11y, UseExtensionError}, storage::{self, Content, Document, Folder, Scope}};
use mime::Mime;
use thiserror::Error;
use tokio::{io::AsyncRead, sync::mpsc::{error::SendError, UnboundedReceiver, UnboundedSender}, task::JoinHandle};
use tracing::instrument;

pub mod search;
mod chat;

pub use chat::{chat, simple_rag};

/// A unit of work that can be executed asynchronously.
///
/// A job combines an asynchronous function with documents and extensions. The function is given
/// access to these assets and can use them to perform some work.
///
/// When [`Job::run`] is called, the fields below are moved into an [`Assets`] value that is
/// passed to the callback by mutable reference. The callback yields either a `T` or a
/// [`RunJobError`].
pub struct Job<T, F: AsyncFnOnce(&mut Assets) -> Result<T, RunJobError> + Send> {
    /// The asynchronous function executed by `run`.
    callback: F,
    /// Documents attached to the job.
    documents: Vec<Document>,
    /// Scopes attached to the job.
    scopes: Vec<Scope>,
    /// Sender/receiver pair for adding assets later; created on demand by `asset_sender`.
    asset_channel: Option<(AssetSender, UnboundedReceiver<AssetItem>)>,
    /// Extensions made available to the callback.
    extensions: Arc<Vec<ActiveExtension>>,
}

impl<T, F: AsyncFnOnce(&mut Assets) -> Result<T, RunJobError> + Send> Job<T, F> {
    /// Create a new job with the given callback function.
    pub fn new(callback: F) -> Self {
        Self {
            callback,
            documents: Vec::new(),
            scopes: Vec::new(),
            asset_channel: None,
            extensions: Arc::new(Vec::new()),
        }
    }

    /// Append a follow-up step to this job.
    ///
    /// The returned job first runs this job's callback. If that succeeds, its output is handed
    /// to `callback` together with the same [`Assets`]; if it fails, the error is returned and
    /// `callback` is never invoked.
    ///
    /// Documents, scopes, the asset channel, and the extensions carry over unchanged.
    pub fn and_then<T2, C: AsyncFnOnce(&mut Assets, T) -> Result<T2, RunJobError> + Send>(self, callback: C) -> Job<T2, impl AsyncFnOnce(&mut Assets) -> Result<T2, RunJobError> + Send> {
        let Job { callback: first_step, documents, scopes, asset_channel, extensions } = self;
        Job {
            // The closure is written inline so its signature is inferred from the field's bound.
            callback: async move |assets| {
                let intermediate = first_step(assets).await?;
                callback(assets, intermediate).await
            },
            documents,
            scopes,
            asset_channel,
            extensions,
        }
    }

    /// Chain a second job that is built from this job's output.
    ///
    /// The returned job first runs this job's callback. On success, `callback` turns the output
    /// into a new [`Job`], which is then run and whose result becomes the result of the whole
    /// chain. An error from the first callback is returned without building the second job.
    ///
    /// The chained job is configured as follows before it runs:
    /// - its extensions are replaced by this job's extensions (via [`Job::with_extensions`],
    ///   which also discards any asset channel the chained job already had);
    /// - all documents *and scopes* currently held by this job's [`Assets`] are moved into it.
    ///
    /// Because the assets are moved, the outer [`Assets`] has no documents or scopes left after
    /// the chained job has run. Items still waiting in the outer asset channel are not forwarded.
    pub fn and_chain<T2, F2: AsyncFnOnce(&mut Assets) -> Result<T2, RunJobError> + Send, C: FnOnce(T) -> Job<T2, F2> + Send>(self, callback: C) -> Job<T2, impl AsyncFnOnce(&mut Assets) -> Result<T2, RunJobError> + Send> {
        let callback0 = self.callback;
        Job {
            callback: async move |assets| {
                let result0 = callback0(assets).await?;
                let mut next_job = callback(result0)
                    // apply same extensions to the next job
                    .with_extensions(assets.extensions.iter().cloned());

                // Hand every asset over to the chained job. Scopes are forwarded as well as
                // documents so the chained callback sees the same asset set as this job did.
                for doc in assets.documents.drain(..) {
                    next_job.add_document(doc);
                }
                next_job.scopes.append(&mut assets.scopes);
                next_job.run().await
            },
            documents: self.documents,
            scopes: self.scopes,
            asset_channel: self.asset_channel,
            extensions: self.extensions,
        }
    }

    /// Chain a second job that is built asynchronously from this job's output.
    ///
    /// Behaves like [`Job::and_chain`], except that `callback` is itself asynchronous: it is
    /// awaited to obtain the next [`Job`], which is then run and whose result becomes the
    /// result of the whole chain. An error from the first callback is returned without
    /// invoking `callback`.
    ///
    /// The chained job is configured as follows before it runs:
    /// - its extensions are replaced by this job's extensions (via [`Job::with_extensions`],
    ///   which also discards any asset channel the chained job already had);
    /// - all documents *and scopes* currently held by this job's [`Assets`] are moved into it.
    ///
    /// Because the assets are moved, the outer [`Assets`] has no documents or scopes left after
    /// the chained job has run. Items still waiting in the outer asset channel are not forwarded.
    pub fn and_chain_async<T2, F2: AsyncFnOnce(&mut Assets) -> Result<T2, RunJobError> + Send, C: AsyncFnOnce(T) -> Job<T2, F2> + Send>(self, callback: C) -> Job<T2, impl AsyncFnOnce(&mut Assets) -> Result<T2, RunJobError> + Send> {
        let callback0 = self.callback;
        Job {
            callback: async move |assets| {
                let result0 = callback0(assets).await?;
                let mut next_job = callback(result0).await
                    // apply same extensions to the next job
                    .with_extensions(assets.extensions.iter().cloned());

                // Hand every asset over to the chained job. Scopes are forwarded as well as
                // documents so the chained callback sees the same asset set as this job did.
                for doc in assets.documents.drain(..) {
                    next_job.add_document(doc);
                }
                next_job.scopes.append(&mut assets.scopes);
                next_job.run().await
            },
            documents: self.documents,
            scopes: self.scopes,
            asset_channel: self.asset_channel,
            extensions: self.extensions,
        }
    }

    /// Replace the set of extensions available to the job.
    ///
    /// The callback, documents, and scopes are kept. The existing asset channel is discarded,
    /// so any `AssetSender` obtained before this call is no longer connected to the job.
    pub fn with_extensions<I: Iterator<Item = ActiveExtension>>(self, extensions: I) -> Self {
        Job {
            asset_channel: None,
            extensions: Arc::new(extensions.collect()),
            ..self
        }
    }

    /// Attach a single document to the job.
    ///
    /// Returns `self` so calls can be chained.
    pub fn add_document(&mut self, document: Document) -> &mut Self {
        let attached = &mut self.documents;
        attached.push(document);
        self
    }

    /// Attach every document found in `folder` and, recursively, in its subfolders.
    ///
    /// The folder's own documents are added first, then each subfolder is processed in the
    /// order returned by `list_folders`.
    ///
    /// # Errors
    ///
    /// Returns the first `storage::Error` raised while listing documents or folders.
    /// Documents added before the failure stay attached to the job.
    pub async fn add_folder(&mut self, folder: Folder) -> Result<&mut Self, storage::Error> {
        let direct_documents = folder.list_documents().await?;
        self.documents.extend(direct_documents);

        let subfolders = folder.list_folders().await?;
        for subfolder in subfolders {
            // The recursive future is boxed so the async fn's future has a finite size.
            Box::pin(self.add_folder(subfolder)).await?;
        }
        Ok(self)
    }

    /// Get the documents currently attached to the job.
    pub fn documents(&self) -> &[Document] {
        self.documents.as_slice()
    }

    /// Get the scopes currently attached to the job.
    pub fn scopes(&self) -> &[Scope] {
        self.scopes.as_slice()
    }

    /// Get an asset sender for this job.
    ///
    /// The sender can be used to queue documents and scopes for the job. The channel is created
    /// on the first call; later calls return another handle to the same channel. Because the
    /// receiving half moves into the job's `Assets` when `Job::run` consumes the job, a sender
    /// obtained here remains usable after the job has started.
    ///
    /// Queued items become visible to the callback once the channel is drained, which is what
    /// `Assets::refresh` does.
    pub fn asset_sender(&mut self) -> AssetSender {
        let (sender, _receiver) = self.asset_channel.get_or_insert_with(|| {
            let (tx, rx) = tokio::sync::mpsc::unbounded_channel::<AssetItem>();
            (AssetSender { inner: tx }, rx)
        });
        sender.clone()
    }

    /// Run the job.
    /// 
    /// This method will execute the callback function with the job's assets.
    /// 
    /// Returns the result of the callback function that was used to create the job.
    pub async fn run(mut self) -> Result<T, RunJobError> {
        let mut assets = Assets {
            documents: self.documents,
            scopes: self.scopes,
            extensions: self.extensions,
            asset_channel: self.asset_channel.take(),
        };

        // Call the callback function with the assets
        (self.callback)(&mut assets).await
    }
}

/// A collection of assets that can be used by a job.
///
/// An `Assets` value is built by `Job::run` from the job's own fields and handed to the job's
/// callback by mutable reference.
pub struct Assets {
    /// Documents currently available to the callback.
    documents: Vec<Document>,
    /// Scopes currently available to the callback.
    scopes: Vec<Scope>,
    /// Extensions the callback may use.
    extensions: Arc<Vec<ActiveExtension>>,
    /// Sender/receiver pair for newly sent assets; drained by `refresh`.
    asset_channel: Option<(AssetSender, UnboundedReceiver<AssetItem>)>,
}

impl Assets {
    /// Pull every pending item off the asset channel and add it to these assets.
    ///
    /// Items are taken without waiting: the loop stops as soon as the channel has nothing
    /// ready. Documents are appended to the document list and scopes to the scope list. If no
    /// asset channel exists, nothing is added.
    ///
    /// Returns an iterator over the assets added by this call.
    #[instrument(skip(self))]
    pub fn refresh(&mut self) -> Refresh<'_> {
        // Remember where the existing assets end so the new ones can be reported.
        let (doc_idx, scope_idx) = (self.documents.len(), self.scopes.len());

        if let Some((_, inbox)) = self.asset_channel.as_mut() {
            while let Ok(incoming) = inbox.try_recv() {
                match incoming {
                    AssetItem::Document(document) => self.documents.push(document),
                    AssetItem::Scope(scope) => self.scopes.push(scope),
                }
            }
        }

        let added_documents = self.documents.len() - doc_idx;
        let added_scopes = self.scopes.len() - scope_idx;
        tracing::debug!("Added {} documents", added_documents);
        tracing::debug!("Added {} scopes", added_scopes);

        Refresh { assets: self, doc_idx, scope_idx }
    }

    /// Get the documents currently held by these assets.
    pub fn documents(&self) -> &[Document] {
        self.documents.as_slice()
    }

    /// Get the scopes currently held by these assets.
    pub fn scopes(&self) -> &[Scope] {
        self.scopes.as_slice()
    }

    /// Get the extensions the job's callback may use.
    pub fn extensions(&self) -> &Vec<ActiveExtension> {
        &*self.extensions
    }

    /// Get an asset sender for these assets.
    ///
    /// The sender can be used to queue documents and scopes, which makes it possible to add
    /// assets from within the job's callback function. The channel is created on the first
    /// call; later calls return another handle to the same channel.
    ///
    /// Note that queued assets are not available until `Assets::refresh` is called.
    pub fn asset_sender(&mut self) -> AssetSender {
        let (sender, _receiver) = self.asset_channel.get_or_insert_with(|| {
            let (tx, rx) = tokio::sync::mpsc::unbounded_channel::<AssetItem>();
            (AssetSender { inner: tx }, rx)
        });
        sender.clone()
    }

    /// Convert content to another MIME type using the available extensions.
    ///
    /// The first converter offered by each extension is tried in extension order. A converter
    /// that reports `ConversionError::UnsupportedMimeType` is skipped; the first successful
    /// result is returned.
    ///
    /// # Errors
    ///
    /// Any other conversion error aborts the search and is returned immediately. If every
    /// candidate declines (or there are none), `ConversionError::UnsupportedMimeType` carrying
    /// `output_type` is returned.
    pub async fn convert(&self, input: Content, output_type: Mime) -> Result<Vec<Box<dyn AsyncRead + Unpin>>, ConversionError> {
        tracing::debug!("Converting content to {}", output_type);
        let candidates: Vec<_> = self.extensions.iter()
            .filter_map(|ext| ext.converters().next())
            .collect();

        tracing::debug!("Found {} converters", candidates.len());
        for converter in candidates {
            match converter.convert(input.clone(), output_type.clone()).await {
                Ok(converted) => return Ok(converted),
                // This converter cannot handle the request; move on to the next one.
                Err(ConversionError::UnsupportedMimeType(_)) => continue,
                Err(fatal) => return Err(fatal),
            }
        }

        Err(ConversionError::UnsupportedMimeType(output_type))
    }

    /// Find a chat provider among the available extensions.
    ///
    /// Extensions are visited in order and only the first chat provider of each is considered.
    /// With `model == None`, the first provider found is returned. Otherwise a provider is
    /// returned only if its `list_models` output contains a model whose id equals the
    /// requested name.
    ///
    /// # Errors
    ///
    /// A failure while listing a provider's models is wrapped in `ChatError::Provider` and
    /// returned immediately. If no provider matches, `ChatError::Provider` with the message
    /// "No chat model found" is returned.
    pub async fn chat_model(&self, model: Option<String>) -> Result<F11y<dyn ChatApi>, ChatError> {
        tracing::debug!("Getting chat model");
        for ext in self.extensions.iter() {
            tracing::debug!("Checking extension {}", ext.name());
            let Some(chat_client) = ext.chat_providers().next() else {
                continue;
            };
            tracing::debug!("Found chat model in extension {}", ext.name());

            let Some(requested_model) = &model else {
                tracing::debug!("No model specified, returning default model");
                return Ok(chat_client);
            };

            tracing::debug!("Looking for model {}", requested_model);
            // TODO reconsider error variant
            let available = chat_client.list_models().await.map_err(|e| ChatError::Provider(Box::new(e)))?;
            for candidate in available {
                if *candidate.id == *requested_model {
                    tracing::debug!("Found model {}", requested_model);
                    return Ok(chat_client);
                }
            }
        }
        // TODO reconsider error variant
        Err(ChatError::Provider("No chat model found".into()))
    }

    /// Collect one embedder per extension.
    ///
    /// Only the first embedder offered by each extension is included; extensions that offer
    /// none are skipped.
    pub fn embedders(&self) -> Vec<F11y<dyn Embedder>> {
        tracing::debug!("Getting embedders");
        self.extensions
            .iter()
            .filter_map(|ext| ext.embedders().next())
            .collect()
    }

    /// Collect one chunker per extension.
    ///
    /// Only the first chunker offered by each extension is included; extensions that offer
    /// none are skipped.
    pub fn chunkers(&self) -> Vec<F11y<dyn Chunker>> {
        tracing::debug!("Getting chunkers");
        self.extensions
            .iter()
            .filter_map(|ext| ext.chunkers().next())
            .collect()
    }

    /// Collect every prompter offered by the available extensions.
    ///
    /// Unlike `embedders` and `chunkers`, all prompters of each extension are included. The
    /// number contributed by each extension is logged at debug level.
    #[instrument(skip(self))]
    pub fn prompters(&self) -> Vec<F11y<dyn Prompter>> {
        tracing::debug!("Getting prompters");
        let mut collected = Vec::new();
        for ext in self.extensions.iter() {
            let already_collected = collected.len();
            collected.extend(ext.prompters());
            let contributed = collected.len() - already_collected;
            tracing::debug!("Found {} prompters in extension {}", contributed, ext.name());
        }
        collected
    }
}

/// An iterator over the assets that have been newly added to a job.
///
/// This struct is returned by the `Assets::refresh` method. It yields clones of the new
/// documents first, followed by clones of the new scopes.
pub struct Refresh<'a> {
    /// The assets being iterated over.
    assets: &'a Assets,
    /// Index of the next document to yield.
    doc_idx: usize,
    /// Index of the next scope to yield.
    scope_idx: usize,
}

impl<'a> Refresh<'a> {
    /// Consume the iterator and yield only the newly added documents.
    pub fn documents(self) -> impl Iterator<Item = Document> {
        self.filter_map(|item| match item {
            AssetItem::Document(doc) => Some(doc),
            AssetItem::Scope(_) => None,
        })
    }

    /// Consume the iterator and yield only the newly added scopes.
    pub fn scopes(self) -> impl Iterator<Item = Scope> {
        self.filter_map(|item| match item {
            AssetItem::Scope(scope) => Some(scope),
            AssetItem::Document(_) => None,
        })
    }
}

impl<'a> Iterator for Refresh<'a> {
    type Item = AssetItem;

    /// Yield the next new asset: all remaining documents first, then the remaining scopes.
    ///
    /// Each item is a clone of the asset stored in the underlying `Assets`.
    fn next(&mut self) -> Option<Self::Item> {
        // Copy the shared reference so the lookups do not hold a borrow of `self`.
        let assets = self.assets;

        if let Some(document) = assets.documents.get(self.doc_idx) {
            self.doc_idx += 1;
            return Some(AssetItem::Document(document.clone()));
        }

        let scope = assets.scopes.get(self.scope_idx)?;
        self.scope_idx += 1;
        Some(AssetItem::Scope(scope.clone()))
    }
}


/// A sender for assets that can be used to send documents and scopes to a job.
///
/// Cloning the sender yields another handle to the same underlying channel.
#[derive(Debug, Clone)]
pub struct AssetSender {
    /// Sending half of the unbounded channel whose receiver is held by the job's assets.
    inner: UnboundedSender<AssetItem>,
}

impl AssetSender {
    /// Send a document to the job.
    /// 
    /// The document will be added to the assets of the job when the job is run or when the job's
    /// callback function calls `Assets::refresh`.
    pub fn send_document(&self, document: Document) -> Result<(), SendError<Document>> {
        self.inner.send(AssetItem::Document(document)).map_err(|e| match e.0 {
            AssetItem::Document(document) => SendError(document),
            _ => unreachable!(),
        })
    }

    /// Send a scope to the job.
    /// 
    /// The scope will be added to the assets of the job when the job is run or when the job's
    /// callback function calls `Assets::refresh`.
    /// 
    /// A scope is a set of documents (e.g., a folder and its subfolders).
    pub fn send_scope(&self, scope: Scope) -> Result<(), SendError<Scope>> {
        self.inner.send(AssetItem::Scope(scope)).map_err(|e| match e.0 {
            AssetItem::Scope(folder) => SendError(folder),
            _ => unreachable!(),
        })
    }
}

/// An item that can be sent to a job's assets.
///
/// This is the message type carried by the asset channel and the item type yielded by
/// [`Refresh`].
pub enum AssetItem {
    /// A single document.
    Document(Document),
    /// A scope, i.e. a set of documents.
    Scope(Scope),
}

/// The error type returned by a job's callback and therefore by `Job::run`.
///
/// Every variant except `Other` has a `From` conversion from its inner error (via
/// `#[from]`), so those errors can be propagated with `?` inside a callback.
#[derive(Debug, Error)]
pub enum RunJobError {
    /// An extension could not be used.
    #[error("Job failed due to extension error: {0}")]
    Extension(#[from] UseExtensionError),

    /// A chat operation failed.
    #[error("Job failed due to chat error: {0}")]
    Chat(#[from] ChatError),

    /// An embedding operation failed.
    #[error("Job failed due to embedding error: {0}")]
    Embedding(#[from] EmbeddingError),

    /// A content conversion failed.
    #[error("Job failed due to conversion error: {0}")]
    Conversion(#[from] ConversionError),

    /// A prompter failed.
    #[error("Job failed due to prompt error: {0}")]
    Prompt(#[from] crate::chat::prompter::PromptError),

    /// A storage operation failed.
    #[error("Job failed due to storage error: {0}")]
    Storage(#[from] storage::Error),

    /// Any other failure. No `From` conversion is generated for this variant, so it must be
    /// constructed explicitly.
    #[error("Job failed: {0}")]
    Other(Box<dyn std::error::Error + Send + Sync>),
}