llm-text 0.1.1

A Rust library for processing text for LLM consumption
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
use std::{collections::HashSet, str::FromStr};

use linkify::{LinkFinder, LinkKind};
use url::Url;

/// Extracts the distinct URLs found in `input`, in order of first appearance.
///
/// The text is scanned with [`LinkFinder`] restricted to [`LinkKind::Url`], and
/// every match is parsed into a [`Url`]. Matches that fail to parse are silently
/// skipped, so the function never fails; text with no URLs yields an empty `Vec`.
///
/// De-duplication happens in two stages:
///
/// 1. on the raw matched text, so repeated occurrences are never parsed twice;
/// 2. on the parsed [`Url`], because differently spelled matches can parse to
///    equal `Url` values and would otherwise both be returned.
pub fn extract_urls<T: AsRef<str>>(input: T) -> Vec<Url> {
    let text = input.as_ref();

    // Borrowed slices of `text`: membership checks need no per-link allocation.
    let mut seen_raw: HashSet<&str> = HashSet::new();
    // Parsed values already emitted; guards against distinct spellings that
    // parse to the same `Url`.
    let mut seen_urls: HashSet<Url> = HashSet::new();

    LinkFinder::new()
        .kinds(&[LinkKind::Url])
        .links(text)
        .map(|link| link.as_str())
        // Cheap first pass: skip text we have already seen verbatim.
        .filter(|&raw| seen_raw.insert(raw))
        // Only parse the comparatively expensive `Url` for unseen text.
        .filter_map(|raw| Url::from_str(raw).ok())
        // Second pass: keep only the first occurrence of each parsed value.
        .filter(|url| seen_urls.insert(url.clone()))
        .collect()
}