mod proselint;
mod vale;
pub use proselint::ProselintEngine;
pub use vale::ValeEngine;
use crate::checker::{Diagnostic, Severity};
use anyhow::Result;
use extism::{Manifest, Plugin, Wasm};
use harper_core::{
Dialect, Document, Lrc,
linting::{LintGroup, Linter},
parsers::Markdown,
spell::FstDictionary,
};
use serde::Deserialize;
use std::path::PathBuf;
use tracing::{debug, warn};
/// A text-checking backend that turns prose into [`Diagnostic`]s.
#[async_trait::async_trait]
pub trait Engine {
    /// Short, fixed identifier for this engine.
    fn name(&self) -> &'static str;

    /// Checks `text` and returns the diagnostics found.
    ///
    /// `language_id` is handed to the implementation as-is; whether and how it
    /// is used is up to each engine.
    ///
    /// # Errors
    ///
    /// Implementations return an error when the check itself could not be
    /// carried out.
    async fn check(&mut self, text: &str, language_id: &str) -> Result<Vec<Diagnostic>>;

    /// Languages this engine declares support for.
    ///
    /// The default is an empty list, which [`engine_supports_language`] treats
    /// as "supports every language".
    fn supported_languages(&self) -> Vec<&'static str> {
        Vec::new()
    }
}
/// Reports whether `engine` should be run for text tagged with `lang_tag`.
///
/// Only the part of `lang_tag` before the first `-` is compared (so `en-US`
/// is matched as `en`), ignoring ASCII case. An engine that declares no
/// languages at all is considered to support everything.
pub fn engine_supports_language(engine: &(dyn Engine + Send), lang_tag: &str) -> bool {
    let declared = engine.supported_languages();
    let primary = match lang_tag.split_once('-') {
        Some((head, _)) => head,
        None => lang_tag,
    };
    declared.is_empty() || declared.iter().any(|lang| lang.eq_ignore_ascii_case(primary))
}
/// Builds a lookup table from character index to byte offset in `text`.
///
/// Entry `i` is the byte offset at which the `i`-th `char` starts; one extra
/// trailing entry holds `text.len()` so an exclusive end index can be looked up.
// Offsets are stored as `u32`; the `usize -> u32` cast is accepted as-is.
#[allow(clippy::cast_possible_truncation)]
fn char_to_byte_table(text: &str) -> Vec<u32> {
    text.char_indices()
        .map(|(offset, _)| offset)
        .chain(std::iter::once(text.len()))
        .map(|offset| offset as u32)
        .collect()
}
/// Builds a lookup table from UTF-16 code-unit index to byte offset in `text`.
///
/// Every code unit of a character maps to that character's starting byte, so
/// a character needing two UTF-16 units contributes two identical entries.
/// One extra trailing entry holds `text.len()`.
// Offsets are stored as `u32`; the `usize -> u32` cast is accepted as-is.
#[allow(clippy::cast_possible_truncation)]
fn utf16_to_byte_table(text: &str) -> Vec<u32> {
    text.char_indices()
        .flat_map(|(start, ch)| std::iter::repeat(start as u32).take(ch.len_utf16()))
        .chain(std::iter::once(text.len() as u32))
        .collect()
}
/// Returns `table[idx]`, clamping out-of-range indices to the last entry.
///
/// An empty table yields `0`.
fn lookup_offset(table: &[u32], idx: usize) -> u32 {
    match (table.get(idx), table.last()) {
        (Some(&hit), _) => hit,
        (None, Some(&end)) => end,
        (None, None) => 0,
    }
}
/// Engine that lints text in-process with `harper_core`.
pub struct HarperEngine {
/// Rule group created by `LintGroup::new_curated` in `new`, with per-rule
/// overrides from the configuration applied.
linter: LintGroup,
/// Dictionary handed to the linter at construction and to `Document::new`
/// on every `check` call.
dict: Lrc<FstDictionary>,
}
impl HarperEngine {
    /// Builds a Harper engine from `config`.
    ///
    /// `config.dialect` picks the English dialect; any value other than
    /// `"British"`, `"Canadian"` or `"Australian"` selects American. Every
    /// entry of `config.linters` then switches the named rule on or off.
    #[must_use]
    pub fn new(config: &crate::config::HarperConfig) -> Self {
        let dict = FstDictionary::curated();
        let dialect = match config.dialect.as_str() {
            "Australian" => Dialect::Australian,
            "British" => Dialect::British,
            "Canadian" => Dialect::Canadian,
            _ => Dialect::American,
        };

        let mut linter = LintGroup::new_curated(dict.clone(), dialect);
        config.linters.iter().for_each(|(rule, enabled)| {
            linter.config.set_rule_enabled(rule, *enabled);
        });

        Self { linter, dict }
    }
}
#[async_trait::async_trait]
impl Engine for HarperEngine {
    /// Fixed identifier of this engine.
    fn name(&self) -> &'static str {
        "harper"
    }

    /// Harper declares English only.
    fn supported_languages(&self) -> Vec<&'static str> {
        vec!["en"]
    }

    /// Lints `text` as Markdown and converts each Harper lint to a
    /// [`Diagnostic`].
    ///
    /// Lint spans are treated as character indices and translated to byte
    /// offsets through `char_to_byte_table`. The language id is ignored.
    async fn check(&mut self, text: &str, _language_id: &str) -> Result<Vec<Diagnostic>> {
        /// Turns one Harper suggestion into the string stored on the diagnostic.
        fn render(suggestion: harper_core::linting::Suggestion) -> String {
            match suggestion {
                harper_core::linting::Suggestion::ReplaceWith(chars) => {
                    chars.into_iter().collect::<String>()
                }
                harper_core::linting::Suggestion::InsertAfter(chars) => {
                    let content: String = chars.into_iter().collect();
                    format!("Insert \"{content}\"")
                }
                harper_core::linting::Suggestion::Remove => String::new(),
            }
        }

        let document = Document::new(text, &Markdown::default(), self.dict.as_ref());
        let lints = self.linter.lint(&document);
        let byte_offsets = char_to_byte_table(text);

        let mut diagnostics = Vec::new();
        for lint in lints {
            let suggestions = lint.suggestions.into_iter().map(render).collect();
            diagnostics.push(Diagnostic {
                start_byte: lookup_offset(&byte_offsets, lint.span.start),
                end_byte: lookup_offset(&byte_offsets, lint.span.end),
                message: lint.message,
                suggestions,
                rule_id: format!("harper.{:?}", lint.lint_kind),
                severity: Severity::Warning as i32,
                unified_id: String::new(),
                confidence: 0.8,
            });
        }
        Ok(diagnostics)
    }
}
/// Engine that sends text to a LanguageTool server over HTTP.
pub struct LanguageToolEngine {
/// Base URL of the server; `check` appends `/v2/check` to it.
url: String,
/// Sent as the `level` form parameter unless it equals `"default"`.
level: String,
/// Sent as the `motherTongue` form parameter when set.
mother_tongue: Option<String>,
/// Joined with commas and sent as `disabledRules` when non-empty.
disabled_rules: Vec<String>,
/// Joined with commas and sent as `enabledRules` when non-empty.
enabled_rules: Vec<String>,
/// Joined with commas and sent as `disabledCategories` when non-empty.
disabled_categories: Vec<String>,
/// Joined with commas and sent as `enabledCategories` when non-empty.
enabled_categories: Vec<String>,
/// HTTP client built once in `new` and reused for every request.
client: reqwest::Client,
}
/// The part of the LanguageTool JSON response that `check` reads.
#[derive(Deserialize)]
struct LTResponse {
/// One entry per reported problem.
matches: Vec<LTMatch>,
}
/// A single problem reported by LanguageTool.
#[derive(Deserialize)]
struct LTMatch {
/// Human-readable description, copied into the diagnostic message.
message: String,
/// Start of the match; `check` resolves it through the UTF-16 offset table.
offset: usize,
/// Length of the match, in the same unit as `offset`.
length: usize,
/// Proposed replacements; their `value`s become the diagnostic suggestions.
replacements: Vec<LTReplacement>,
/// Rule that produced the match.
rule: LTRule,
}
/// One replacement proposed for a match.
#[derive(Deserialize)]
struct LTReplacement {
/// Replacement text.
value: String,
}
/// Rule metadata attached to a LanguageTool match.
///
/// Fields are read from camelCase JSON keys (`issueType`).
#[derive(Deserialize)]
#[serde(rename_all = "camelCase")]
struct LTRule {
    /// Rule identifier; becomes the `languagetool.<id>` rule id.
    id: String,
    /// Issue type used to pick the diagnostic severity.
    ///
    /// Defaults to an empty string when the server omits `issueType`, so one
    /// rule without it does not make the whole response fail to parse. An
    /// empty value falls into the catch-all severity.
    #[serde(default)]
    issue_type: String,
}
impl LanguageToolEngine {
    /// Creates an engine from `config`, copying every setting it needs.
    ///
    /// The HTTP client uses a 3 second connect timeout and a 10 second overall
    /// timeout. If the client builder fails, a default client is used instead.
    #[must_use]
    pub fn new(config: &crate::config::LanguageToolConfig) -> Self {
        const CONNECT_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(3);
        const REQUEST_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(10);

        let builder = reqwest::Client::builder()
            .connect_timeout(CONNECT_TIMEOUT)
            .timeout(REQUEST_TIMEOUT);
        let client = builder.build().unwrap_or_default();

        Self {
            client,
            url: config.url.clone(),
            level: config.level.clone(),
            mother_tongue: config.mother_tongue.clone(),
            enabled_rules: config.enabled_rules.clone(),
            disabled_rules: config.disabled_rules.clone(),
            enabled_categories: config.enabled_categories.clone(),
            disabled_categories: config.disabled_categories.clone(),
        }
    }
}
// `check` is one long request/response sequence, and elapsed milliseconds are
// logged as `u64`; both lints are accepted for this impl.
#[allow(clippy::too_many_lines, clippy::cast_possible_truncation)]
#[async_trait::async_trait]
impl Engine for LanguageToolEngine {
    /// Fixed identifier of this engine.
    fn name(&self) -> &'static str {
        "languagetool"
    }

    /// Posts `text` to `<url>/v2/check` and converts the matches to diagnostics.
    ///
    /// `language_id` is sent verbatim as the `language` form parameter. Match
    /// offsets are resolved to byte offsets through `utf16_to_byte_table`.
    ///
    /// # Errors
    ///
    /// Returns an error when the request cannot be sent, the server answers
    /// with a non-success status, or the response body is not the expected JSON.
    async fn check(&mut self, text: &str, language_id: &str) -> Result<Vec<Diagnostic>> {
        // A base URL configured with a trailing slash must not produce
        // `//v2/check`.
        let url = format!("{}/v2/check", self.url.trim_end_matches('/'));
        let lt_lang = language_id;
        debug!(
            url = %url,
            language = lt_lang,
            text_len = text.len(),
            "LanguageTool request"
        );

        // Mandatory parameters first; optional ones only when configured.
        let mut form_params: Vec<(&str, String)> = vec![
            ("text", text.to_string()),
            ("language", lt_lang.to_string()),
        ];
        if self.level != "default" {
            form_params.push(("level", self.level.clone()));
        }
        if let Some(ref mt) = self.mother_tongue {
            form_params.push(("motherTongue", mt.clone()));
        }
        if !self.disabled_rules.is_empty() {
            form_params.push(("disabledRules", self.disabled_rules.join(",")));
        }
        if !self.enabled_rules.is_empty() {
            form_params.push(("enabledRules", self.enabled_rules.join(",")));
        }
        if !self.disabled_categories.is_empty() {
            form_params.push(("disabledCategories", self.disabled_categories.join(",")));
        }
        if !self.enabled_categories.is_empty() {
            form_params.push(("enabledCategories", self.enabled_categories.join(",")));
        }

        let request_start = std::time::Instant::now();
        let response = match self.client.post(&url).form(&form_params).send().await {
            Ok(r) => {
                let status = r.status();
                debug!(
                    status = %status,
                    elapsed_ms = request_start.elapsed().as_millis() as u64,
                    "LanguageTool HTTP response"
                );
                if !status.is_success() {
                    // The body usually carries the server's explanation; an
                    // unreadable body is reported as empty.
                    let body = r.text().await.unwrap_or_default();
                    warn!(
                        status = %status,
                        body = %body,
                        "LanguageTool returned non-200"
                    );
                    return Err(anyhow::anyhow!("LanguageTool HTTP {status}: {body}"));
                }
                r
            }
            Err(e) => {
                warn!(
                    elapsed_ms = request_start.elapsed().as_millis() as u64,
                    "LanguageTool connection error: {e}"
                );
                return Err(anyhow::anyhow!("LanguageTool connection error: {e}"));
            }
        };

        let res = match response.json::<LTResponse>().await {
            Ok(r) => r,
            Err(e) => {
                warn!("LanguageTool JSON parse error: {e}");
                return Err(anyhow::anyhow!("LanguageTool JSON parse error: {e}"));
            }
        };
        debug!(
            matches = res.matches.len(),
            elapsed_ms = request_start.elapsed().as_millis() as u64,
            "LanguageTool check complete"
        );

        let utf16_to_byte = utf16_to_byte_table(text);
        let diagnostics = res
            .matches
            .into_iter()
            .map(|m| {
                // Spelling mistakes are errors, typographical issues warnings,
                // everything else informational.
                let severity = match m.rule.issue_type.as_str() {
                    "misspelling" => Severity::Error,
                    "typographical" => Severity::Warning,
                    _ => Severity::Information,
                };
                Diagnostic {
                    start_byte: lookup_offset(&utf16_to_byte, m.offset),
                    // Values come from the server: saturate rather than
                    // overflow; `lookup_offset` clamps to the end of the text.
                    end_byte: lookup_offset(&utf16_to_byte, m.offset.saturating_add(m.length)),
                    message: m.message,
                    suggestions: m.replacements.into_iter().map(|r| r.value).collect(),
                    rule_id: format!("languagetool.{}", m.rule.id),
                    severity: severity as i32,
                    unified_id: String::new(),
                    confidence: 0.8,
                }
            })
            .collect();
        Ok(diagnostics)
    }
}
/// Engine that delegates checking to an external command.
pub struct ExternalEngine {
/// Provider name; used in log output and in `external.<name>` rule ids.
name: String,
/// Program to spawn for every `check` call.
command: String,
/// Arguments passed to `command`.
args: Vec<String>,
}
impl ExternalEngine {
    /// Creates an engine that runs `command` with `args` on every check and
    /// reports its findings under the provider name `name`.
    ///
    /// Nothing is spawned or validated here.
    #[must_use]
    pub const fn new(name: String, command: String, args: Vec<String>) -> Self {
        Self { name, command, args }
    }
}
/// Request payload serialized to JSON and written to the external command's
/// stdin.
#[derive(serde::Serialize)]
struct ExternalRequest<'a> {
/// Text to check.
text: &'a str,
/// Language id forwarded unchanged from `Engine::check`.
language_id: &'a str,
}
/// One diagnostic as emitted (in a JSON array) by an external command or a
/// WASM plugin.
#[derive(Deserialize)]
struct ExternalDiagnostic {
/// Start offset; copied into the resulting `Diagnostic` without validation.
start_byte: u32,
/// End offset; copied into the resulting `Diagnostic` without validation.
end_byte: u32,
/// Description of the problem.
message: String,
/// Suggested replacements; empty when omitted.
#[serde(default)]
suggestions: Vec<String>,
/// Provider-local rule id; when empty, the engines use a provider-level id.
#[serde(default)]
rule_id: String,
/// Numeric severity; falls back to `default_severity_value()` when omitted.
#[serde(default = "default_severity_value")]
severity: i32,
/// Confidence; the engines replace any value that is not above zero
/// (including the omitted default of `0.0`) with `0.7`.
#[serde(default)]
confidence: f32,
}
/// Severity assigned to an external diagnostic that does not specify one.
const fn default_severity_value() -> i32 {
    const FALLBACK: i32 = Severity::Warning as i32;
    FALLBACK
}
#[async_trait::async_trait]
impl Engine for ExternalEngine {
    /// Fixed identifier of this engine kind; the configured provider name only
    /// appears in rule ids and log output.
    fn name(&self) -> &'static str {
        "external"
    }

    /// Runs the external command, feeding it a JSON request on stdin and
    /// parsing a JSON array of diagnostics from its stdout.
    ///
    /// Provider failures are best-effort: if the command cannot be spawned,
    /// exits unsuccessfully, or prints output that is not valid JSON, a
    /// warning is logged and an empty list is returned.
    ///
    /// # Errors
    ///
    /// Returns an error when the request cannot be serialized or when waiting
    /// for the child process fails.
    async fn check(&mut self, text: &str, language_id: &str) -> Result<Vec<Diagnostic>> {
        use tokio::io::AsyncWriteExt;
        use tokio::process::Command;

        let request = ExternalRequest { text, language_id };
        let input = serde_json::to_string(&request)?;

        let mut child = match Command::new(&self.command)
            .args(&self.args)
            .stdin(std::process::Stdio::piped())
            .stdout(std::process::Stdio::piped())
            .stderr(std::process::Stdio::piped())
            .spawn()
        {
            Ok(child) => child,
            Err(e) => {
                warn!(provider = %self.name, "Failed to spawn external provider: {e}");
                return Ok(vec![]);
            }
        };

        // Feed stdin while stdout/stderr are being drained. Writing the whole
        // request first can deadlock: a child that starts producing output
        // before it has consumed its input blocks on a full output pipe while
        // we block on a full input pipe.
        let stdin = child.stdin.take();
        let feed_stdin = async move {
            if let Some(mut stdin) = stdin {
                // Best-effort: the child may exit without reading its input.
                let _ = stdin.write_all(input.as_bytes()).await;
                let _ = stdin.shutdown().await;
            }
            // `stdin` is dropped here, so the child sees EOF.
        };
        let ((), waited) = tokio::join!(feed_stdin, child.wait_with_output());
        let output = waited?;

        if !output.status.success() {
            let stderr = String::from_utf8_lossy(&output.stderr);
            warn!(
                provider = %self.name,
                status = %output.status,
                stderr = stderr.trim(),
                "External provider exited with error"
            );
            return Ok(vec![]);
        }

        let stdout = String::from_utf8_lossy(&output.stdout);
        let ext_diagnostics: Vec<ExternalDiagnostic> = match serde_json::from_str(&stdout) {
            Ok(d) => d,
            Err(e) => {
                warn!(provider = %self.name, "Failed to parse external provider output: {e}");
                return Ok(vec![]);
            }
        };

        let diagnostics = ext_diagnostics
            .into_iter()
            .map(|ed| {
                // Namespace rule ids by provider so they cannot collide.
                let rule_id = if ed.rule_id.is_empty() {
                    format!("external.{}", self.name)
                } else {
                    format!("external.{}.{}", self.name, ed.rule_id)
                };
                Diagnostic {
                    start_byte: ed.start_byte,
                    end_byte: ed.end_byte,
                    message: ed.message,
                    suggestions: ed.suggestions,
                    rule_id,
                    severity: ed.severity,
                    unified_id: String::new(),
                    // A confidence that is not above zero counts as unset.
                    confidence: if ed.confidence > 0.0 {
                        ed.confidence
                    } else {
                        0.7
                    },
                }
            })
            .collect();
        Ok(diagnostics)
    }
}
/// Engine that delegates checking to an Extism WASM plugin.
pub struct WasmEngine {
/// Plugin name; used in log output and in `wasm.<name>` rule ids.
name: String,
/// Loaded plugin whose `check` export is called for every request.
plugin: Plugin,
}
// SAFETY: NOTE(review): this asserts that moving a `WasmEngine` (and the
// `extism::Plugin` it owns) to another thread is sound. Nothing in this file
// establishes that; presumably it relies on the plugin only ever being used
// through `&mut self` — confirm against the extism version in use, and drop
// this impl if `Plugin` is already `Send`.
unsafe impl Send for WasmEngine {}
impl WasmEngine {
pub fn new(name: String, wasm_path: PathBuf) -> Result<Self> {
let wasm = Wasm::file(wasm_path);
let manifest = Manifest::new([wasm]);
let plugin = Plugin::new(&manifest, [], true)?;
Ok(Self { name, plugin })
}
pub fn from_bytes(name: String, wasm_bytes: &[u8]) -> Result<Self> {
let wasm = Wasm::data(wasm_bytes.to_vec());
let manifest = Manifest::new([wasm]);
let plugin = Plugin::new(&manifest, [], true)?;
Ok(Self { name, plugin })
}
}
#[async_trait::async_trait]
impl Engine for WasmEngine {
    /// Fixed identifier of this engine kind; the plugin's own name only
    /// appears in rule ids and log output.
    fn name(&self) -> &'static str {
        "wasm"
    }

    /// Calls the plugin's `check` export with a JSON request and parses a
    /// JSON array of diagnostics from its reply.
    ///
    /// A failed call or unparsable reply is logged as a warning and produces
    /// an empty result rather than an error.
    async fn check(&mut self, text: &str, language_id: &str) -> Result<Vec<Diagnostic>> {
        let input = serde_json::json!({
            "text": text,
            "language_id": language_id,
        })
        .to_string();

        let reply = match self.plugin.call::<&str, &str>("check", &input) {
            Ok(raw) => raw.to_string(),
            Err(e) => {
                warn!(plugin = %self.name, "WASM plugin call failed: {e}");
                return Ok(vec![]);
            }
        };

        let parsed: Vec<ExternalDiagnostic> = match serde_json::from_str(&reply) {
            Ok(items) => items,
            Err(e) => {
                warn!(plugin = %self.name, "Failed to parse WASM plugin output: {e}");
                return Ok(vec![]);
            }
        };

        let mut diagnostics = Vec::with_capacity(parsed.len());
        for item in parsed {
            // Namespace rule ids by plugin so they cannot collide.
            let rule_id = match item.rule_id.as_str() {
                "" => format!("wasm.{}", self.name),
                id => format!("wasm.{}.{}", self.name, id),
            };
            // A confidence that is not above zero counts as unset.
            let confidence = if item.confidence > 0.0 {
                item.confidence
            } else {
                0.7
            };
            diagnostics.push(Diagnostic {
                start_byte: item.start_byte,
                end_byte: item.end_byte,
                message: item.message,
                suggestions: item.suggestions,
                rule_id,
                severity: item.severity,
                unified_id: String::new(),
                confidence,
            });
        }
        Ok(diagnostics)
    }
}
/// Lists the WASM plugins found directly inside `plugin_dir`.
///
/// Returns one `(name, path)` pair per regular file whose extension is
/// `wasm`, where `name` is the file stem. Directories that merely carry a
/// `.wasm` extension are skipped, since they cannot be loaded as a module.
/// The result is sorted so the order does not depend on how the filesystem
/// enumerates entries.
///
/// A missing or unreadable directory yields an empty list; unreadable
/// individual entries are skipped.
#[must_use]
pub fn discover_wasm_plugins(plugin_dir: &std::path::Path) -> Vec<(String, PathBuf)> {
    let Ok(entries) = std::fs::read_dir(plugin_dir) else {
        return Vec::new();
    };

    let mut plugins: Vec<(String, PathBuf)> = entries
        .filter_map(|entry| {
            let path = entry.ok()?.path();
            let has_wasm_ext = path.extension().is_some_and(|ext| ext == "wasm");
            // `is_file` follows symlinks, so linked plugins are still found.
            if !has_wasm_ext || !path.is_file() {
                return None;
            }
            let name = path
                .file_stem()
                .map(|stem| stem.to_string_lossy().into_owned())
                .unwrap_or_default();
            Some((name, path))
        })
        .collect();

    plugins.sort();
    plugins
}
#[cfg(test)]
mod tests {
    use super::*;

    /// A three-byte em dash shifts later characters' byte offsets.
    #[test]
    fn char_to_byte_handles_multibyte() {
        let table = char_to_byte_table("a—b");
        assert_eq!(table, vec![0, 1, 4, 5]);
        assert_eq!(lookup_offset(&table, 2), 4);
        assert_eq!(lookup_offset(&table, 3), 5);
        // Out-of-range indices clamp to the final entry.
        assert_eq!(lookup_offset(&table, 99), 5);
    }

    /// An astral character occupies two UTF-16 units that map to one byte offset.
    #[test]
    fn utf16_to_byte_handles_astral() {
        let table = utf16_to_byte_table("a😀b");
        assert_eq!(table, vec![0, 1, 1, 5, 6]);
        assert_eq!(lookup_offset(&table, 3), 5);
    }

    #[test]
    fn em_dash_does_not_shift_byte_offsets() {
        let table = char_to_byte_table("a—b");
        assert_eq!(lookup_offset(&table, 2), 4);
        assert_eq!(lookup_offset(&table, 3), 5);
    }

    #[tokio::test]
    async fn test_harper_engine() -> Result<()> {
        let mut engine = HarperEngine::new(&crate::config::HarperConfig::default());
        let text = "This is an test.";
        let diagnostics = engine.check(text, "en-US").await?;
        assert!(!diagnostics.is_empty());
        Ok(())
    }

    /// Spans reported after a multi-byte character must still be valid byte ranges.
    #[tokio::test]
    async fn harper_offsets_are_bytes_after_em_dash() -> Result<()> {
        let mut engine = HarperEngine::new(&crate::config::HarperConfig::default());
        let text = "Some prose — this is an test.";
        let diagnostics = engine.check(text, "en-US").await?;
        assert!(!diagnostics.is_empty(), "Harper should flag 'an test'");
        for d in &diagnostics {
            let (s, e) = (d.start_byte as usize, d.end_byte as usize);
            assert!(text.is_char_boundary(s), "start {s} not a char boundary");
            assert!(text.is_char_boundary(e), "end {e} not a char boundary");
            assert!(s <= e && e <= text.len(), "span ({s},{e}) out of range");
        }
        Ok(())
    }

    #[tokio::test]
    async fn external_engine_with_echo() -> Result<()> {
        let mut engine = ExternalEngine::new(
            "test-provider".to_string(),
            "sh".to_string(),
            vec![
                "-c".to_string(),
                r#"cat > /dev/null; echo '[{"start_byte":0,"end_byte":4,"message":"test issue","suggestions":["fix"],"rule_id":"test.rule","severity":2}]'"#.to_string(),
            ],
        );
        let diagnostics = engine.check("some text", "markdown").await?;
        assert_eq!(diagnostics.len(), 1);
        assert_eq!(diagnostics[0].message, "test issue");
        assert_eq!(diagnostics[0].rule_id, "external.test-provider.test.rule");
        assert_eq!(diagnostics[0].suggestions, vec!["fix"]);
        assert_eq!(diagnostics[0].start_byte, 0);
        assert_eq!(diagnostics[0].end_byte, 4);
        Ok(())
    }

    /// A provider that cannot be spawned yields no diagnostics, not an error.
    #[tokio::test]
    async fn external_engine_missing_binary() -> Result<()> {
        let mut engine = ExternalEngine::new(
            "nonexistent".to_string(),
            "/nonexistent/binary".to_string(),
            vec![],
        );
        let diagnostics = engine.check("text", "markdown").await?;
        assert!(diagnostics.is_empty());
        Ok(())
    }

    /// Unparsable provider output yields no diagnostics, not an error.
    #[tokio::test]
    async fn external_engine_bad_json_output() -> Result<()> {
        let mut engine = ExternalEngine::new(
            "bad-json".to_string(),
            "echo".to_string(),
            vec!["not json".to_string()],
        );
        let diagnostics = engine.check("text", "markdown").await?;
        assert!(diagnostics.is_empty());
        Ok(())
    }

    #[test]
    fn wasm_engine_invalid_bytes_returns_error() {
        let result = WasmEngine::from_bytes("bad-plugin".to_string(), b"not a wasm file");
        assert!(result.is_err());
    }

    #[test]
    fn wasm_engine_missing_file_returns_error() {
        let result = WasmEngine::new(
            "missing".to_string(),
            PathBuf::from("/nonexistent/plugin.wasm"),
        );
        assert!(result.is_err());
    }

    #[test]
    fn discover_wasm_plugins_empty_dir() {
        let dir = std::env::temp_dir().join("lang_check_test_wasm_empty");
        let _ = std::fs::remove_dir_all(&dir);
        std::fs::create_dir_all(&dir).unwrap();
        let plugins = discover_wasm_plugins(&dir);
        assert!(plugins.is_empty());
        let _ = std::fs::remove_dir_all(&dir);
    }

    #[test]
    fn discover_wasm_plugins_finds_wasm_files() {
        let dir = std::env::temp_dir().join("lang_check_test_wasm_discover");
        let _ = std::fs::remove_dir_all(&dir);
        std::fs::create_dir_all(&dir).unwrap();
        std::fs::write(dir.join("checker.wasm"), b"fake").unwrap();
        std::fs::write(dir.join("linter.wasm"), b"fake").unwrap();
        std::fs::write(dir.join("readme.txt"), b"not a plugin").unwrap();
        let mut plugins = discover_wasm_plugins(&dir);
        // Sort locally so the assertions hold whatever order discovery returns.
        plugins.sort_by(|a, b| a.0.cmp(&b.0));
        assert_eq!(plugins.len(), 2);
        assert_eq!(plugins[0].0, "checker");
        assert_eq!(plugins[1].0, "linter");
        assert!(plugins[0].1.ends_with("checker.wasm"));
        assert!(plugins[1].1.ends_with("linter.wasm"));
        let _ = std::fs::remove_dir_all(&dir);
    }

    #[test]
    fn discover_wasm_plugins_nonexistent_dir() {
        let plugins = discover_wasm_plugins(std::path::Path::new("/nonexistent/dir"));
        assert!(plugins.is_empty());
    }

    /// Live round-trip against the server from the default configuration.
    #[tokio::test]
    #[ignore = "requires a running LanguageTool server"]
    async fn lt_engine_live() -> Result<()> {
        let _ = tracing_subscriber::fmt()
            .with_env_filter("debug")
            .with_writer(std::io::stderr)
            .with_target(false)
            .try_init();
        let mut engine = LanguageToolEngine::new(&crate::config::LanguageToolConfig::default());
        let text = "This is a sentnce with erors.";
        // The engine forwards this value verbatim as LanguageTool's `language`
        // parameter, so it must be a language code rather than a document type.
        let diagnostics = engine.check(text, "en-US").await?;
        println!("LT returned {} diagnostics:", diagnostics.len());
        for d in &diagnostics {
            println!(
                "  [{}-{}] {} (rule: {}, suggestions: {:?})",
                d.start_byte, d.end_byte, d.message, d.rule_id, d.suggestions
            );
        }
        assert!(
            diagnostics.len() >= 2,
            "Expected at least 2 spelling errors, got {}",
            diagnostics.len()
        );
        Ok(())
    }

    /// `issueType` in the JSON must land in the snake_case `issue_type` field.
    #[test]
    fn lt_response_deserializes_camel_case() {
        let json = r#"{
"matches": [{
"message": "Possible spelling mistake found.",
"offset": 10,
"length": 7,
"replacements": [{"value": "sentence"}],
"rule": {
"id": "MORFOLOGIK_RULE_EN_US",
"description": "Possible spelling mistake",
"issueType": "misspelling",
"category": {"id": "TYPOS", "name": "Possible Typo"}
}
}]
}"#;
        let res: LTResponse = serde_json::from_str(json).unwrap();
        assert_eq!(res.matches.len(), 1);
        assert_eq!(res.matches[0].rule.id, "MORFOLOGIK_RULE_EN_US");
        assert_eq!(res.matches[0].rule.issue_type, "misspelling");
        assert_eq!(res.matches[0].offset, 10);
        assert_eq!(res.matches[0].length, 7);
        assert_eq!(res.matches[0].replacements[0].value, "sentence");
    }
}