use crate::Result;
use regex::Regex;
use std::collections::HashMap;
/// Coarse part-of-speech categories assigned to tokens by the tagger.
///
/// Every variant has a Penn-Treebank-style label, see [`POSTag::penn_tag`].
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum POSTag {
    /// Singular common noun (`NN`).
    Noun,
    /// Plural common noun (`NNS`).
    NounPlural,
    /// Singular proper noun (`NNP`).
    ProperNoun,
    /// Plural proper noun (`NNPS`).
    ProperNounPlural,
    /// Base-form verb (`VB`).
    Verb,
    /// Past-tense verb (`VBD`).
    VerbPast,
    /// Gerund / present participle (`VBG`).
    VerbGerund,
    /// Third-person singular present verb (`VBZ`).
    Verb3rdSing,
    /// Adjective (`JJ`).
    Adjective,
    /// Adverb (`RB`).
    Adverb,
    /// Preposition (`IN`).
    Preposition,
    /// Determiner (`DT`).
    Determiner,
    /// Pronoun (`PRP`).
    Pronoun,
    /// Conjunction (`CC`).
    Conjunction,
    /// Punctuation; every punctuation mark shares the single label `.`.
    Punctuation,
    /// Number (`CD`).
    Number,
    /// No category could be determined; labelled `UNK`.
    Unknown,
}

impl POSTag {
    /// Returns the Penn-Treebank-style label for this tag.
    ///
    /// Every label is a string literal, so the returned slice is `'static`
    /// and may outlive the `POSTag` value it was obtained from.
    pub fn penn_tag(&self) -> &'static str {
        match self {
            POSTag::Noun => "NN",
            POSTag::NounPlural => "NNS",
            POSTag::ProperNoun => "NNP",
            POSTag::ProperNounPlural => "NNPS",
            POSTag::Verb => "VB",
            POSTag::VerbPast => "VBD",
            POSTag::VerbGerund => "VBG",
            POSTag::Verb3rdSing => "VBZ",
            POSTag::Adjective => "JJ",
            POSTag::Adverb => "RB",
            POSTag::Preposition => "IN",
            POSTag::Determiner => "DT",
            POSTag::Pronoun => "PRP",
            POSTag::Conjunction => "CC",
            POSTag::Punctuation => ".",
            POSTag::Number => "CD",
            POSTag::Unknown => "UNK",
        }
    }
}
/// Label on a head -> dependent edge produced by the dependency parser.
///
/// `SyntaxAnalyzer::parse_dependencies` in this file emits only `Subject`,
/// `DirectObject`, `Modifier` and `Determiner`; the other variants are
/// declared but not produced by the code visible here.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum DependencyRelation {
/// The dependent is the subject of the root token.
Subject,
/// The dependent is the direct object of the root token.
DirectObject,
/// Indirect object (not emitted by the parser in this file).
IndirectObject,
/// An adjective attached to a following noun, or an adverb attached to a verb.
Modifier,
/// A determiner attached to a following noun.
Determiner,
/// Prepositional modifier (not emitted by the parser in this file).
PrepositionalModifier,
/// Conjunction (not emitted by the parser in this file).
Conjunction,
/// Complement (not emitted by the parser in this file).
Complement,
/// Root marker (not emitted by the parser in this file).
Root,
/// Unclassified relation (not emitted by the parser in this file).
Unknown,
}
/// A single tagged token, as produced by `SyntaxAnalyzer::pos_tag`.
#[derive(Debug, Clone)]
pub struct Token {
/// Surface form exactly as it appeared in the input (original casing).
pub text: String,
/// Zero-based index of the token's first character in the input, counted
/// in `char`s (not bytes) when the token comes from `pos_tag`.
pub position: usize,
/// Part-of-speech tag assigned to the token.
pub pos: POSTag,
/// Lowercased form with a tag-dependent suffix removed where one applies.
pub lemma: String,
}
/// A directed edge between two tokens of one token slice.
///
/// When produced by `SyntaxAnalyzer::parse_dependencies`, both indices refer
/// to positions in the slice that was passed to that call.
#[derive(Debug, Clone)]
pub struct Dependency {
/// Index of the governing token.
pub head: usize,
/// Index of the token that depends on `head`.
pub dependent: usize,
/// Kind of relation between head and dependent.
pub relation: DependencyRelation,
}
/// A contiguous run of determiner/adjective/noun tokens, as produced by
/// `SyntaxAnalyzer::extract_noun_phrases`.
#[derive(Debug, Clone)]
pub struct NounPhrase {
/// Owned copies of the tokens that make up the phrase, in input order.
pub tokens: Vec<Token>,
/// Index into `tokens` of the last noun in the phrase; `0` when the run
/// contains no noun at all.
pub head_idx: usize,
/// The token texts joined with single spaces.
pub text: String,
}
/// Feature switches for [`SyntaxAnalyzer`].
///
/// The analyzer stores this value, but none of its methods visible in this
/// file read the flags yet.
#[derive(Debug, Clone)]
pub struct SyntaxAnalyzerConfig {
    /// Whether part-of-speech tagging is requested.
    pub enable_pos_tagging: bool,
    /// Whether dependency parsing is requested.
    pub enable_dependency_parsing: bool,
    /// Whether noun-phrase extraction is requested.
    pub enable_phrase_extraction: bool,
}

impl Default for SyntaxAnalyzerConfig {
    /// Returns a configuration with every stage switched on.
    fn default() -> Self {
        let (enable_pos_tagging, enable_dependency_parsing, enable_phrase_extraction) =
            (true, true, true);
        Self {
            enable_pos_tagging,
            enable_dependency_parsing,
            enable_phrase_extraction,
        }
    }
}
/// Dictionary- and suffix-based syntax analyzer.
///
/// Holds one word -> tag lookup table per word class; all tables are built
/// once in `SyntaxAnalyzer::new` and keyed by lowercase word.
pub struct SyntaxAnalyzer {
// Stored for callers' benefit but not read by any method visible in this
// file, hence the dead-code allowance.
#[allow(dead_code)] config: SyntaxAnalyzerConfig,
// Common nouns, all mapped to `POSTag::Noun`.
common_nouns: HashMap<String, POSTag>,
// Common base-form verbs, all mapped to `POSTag::Verb`.
common_verbs: HashMap<String, POSTag>,
// Common adjectives, all mapped to `POSTag::Adjective`.
common_adjectives: HashMap<String, POSTag>,
// Common adverbs, all mapped to `POSTag::Adverb`.
common_adverbs: HashMap<String, POSTag>,
// Prepositions, all mapped to `POSTag::Preposition`.
prepositions: HashMap<String, POSTag>,
// Determiners, all mapped to `POSTag::Determiner`.
determiners: HashMap<String, POSTag>,
// Pronouns, all mapped to `POSTag::Pronoun`.
pronouns: HashMap<String, POSTag>,
// Conjunctions, all mapped to `POSTag::Conjunction`.
conjunctions: HashMap<String, POSTag>,
}
impl SyntaxAnalyzer {
/// Creates an analyzer, building all eight word-class dictionaries up front.
///
/// `config` is stored as-is.
pub fn new(config: SyntaxAnalyzerConfig) -> Self {
    // Closed-class words first, then the open-class vocabularies; the
    // builders are independent of one another.
    let determiners = Self::build_determiner_dict();
    let pronouns = Self::build_pronoun_dict();
    let prepositions = Self::build_preposition_dict();
    let conjunctions = Self::build_conjunction_dict();
    let common_adverbs = Self::build_adverb_dict();
    let common_verbs = Self::build_verb_dict();
    let common_adjectives = Self::build_adjective_dict();
    let common_nouns = Self::build_noun_dict();

    Self {
        config,
        common_nouns,
        common_verbs,
        common_adjectives,
        common_adverbs,
        prepositions,
        determiners,
        pronouns,
        conjunctions,
    }
}
/// Splits `text` into word and symbol tokens.
///
/// A word is a maximal run of alphanumeric characters, apostrophes and
/// hyphens. Every other non-whitespace character becomes a one-character
/// token of its own; whitespace only separates tokens.
///
/// Each token is paired with the zero-based index of its first character,
/// counted in `char`s rather than bytes.
fn tokenize(&self, text: &str) -> Vec<(String, usize)> {
    let mut out: Vec<(String, usize)> = Vec::new();
    // The word being accumulated, together with the index where it began.
    let mut pending: Option<(String, usize)> = None;

    for (idx, c) in text.chars().enumerate() {
        let is_word_char = c.is_alphanumeric() || matches!(c, '\'' | '-');
        if is_word_char {
            pending
                .get_or_insert_with(|| (String::new(), idx))
                .0
                .push(c);
            continue;
        }

        // Any non-word character terminates the word in progress.
        if let Some(finished) = pending.take() {
            out.push(finished);
        }
        if !c.is_whitespace() {
            out.push((c.to_string(), idx));
        }
    }

    // Flush a word that runs to the end of the input.
    if let Some(finished) = pending {
        out.push(finished);
    }
    out
}
/// Tokenizes `text` and assigns a part-of-speech tag and lemma to each token.
///
/// Tokens are returned in input order. The visible implementation never
/// produces an error; the `Result` wrapper is part of the public signature.
pub fn pos_tag(&self, text: &str) -> Result<Vec<Token>> {
    let tagged = self
        .tokenize(text)
        .into_iter()
        .map(|(surface, offset)| {
            let tag = self.tag_word(&surface);
            // The lemma depends on the tag, so it is derived second.
            let lemma = self.lemmatize(&surface, &tag);
            Token {
                text: surface,
                position: offset,
                pos: tag,
                lemma,
            }
        })
        .collect();
    Ok(tagged)
}
/// Assigns a part-of-speech tag to a single token.
///
/// Rules are tried in this order and the first match wins:
///
/// 1. only ASCII punctuation (this includes the empty string) -> `Punctuation`
/// 2. only ASCII digits -> `Number`
/// 3. dictionary lookup on the lowercased word: determiners, pronouns,
///    prepositions, conjunctions, adverbs, verbs, adjectives, nouns
/// 4. leading uppercase letter -> `ProperNoun`
/// 5. suffix heuristics: `-ing`, `-ed`, `-ive`/`-ous`/`-ful`, plural `-s`,
///    `-ly`
/// 6. otherwise -> `Noun`
fn tag_word(&self, word: &str) -> POSTag {
    if word.chars().all(|c| c.is_ascii_punctuation()) {
        return POSTag::Punctuation;
    }
    if word.chars().all(|c| c.is_ascii_digit()) {
        return POSTag::Number;
    }

    let lower = word.to_lowercase();

    // Closed-class dictionaries come first so that words listed in several
    // tables (e.g. "that", "so", "work") resolve to the earlier table.
    let lexicons = [
        &self.determiners,
        &self.pronouns,
        &self.prepositions,
        &self.conjunctions,
        &self.common_adverbs,
        &self.common_verbs,
        &self.common_adjectives,
        &self.common_nouns,
    ];
    for lexicon in lexicons.iter() {
        if let Some(tag) = lexicon.get(&lower) {
            return tag.clone();
        }
    }

    // An unknown capitalised word is assumed to be a name.
    if matches!(word.chars().next(), Some(first) if first.is_uppercase()) {
        return POSTag::ProperNoun;
    }

    if lower.ends_with("ing") {
        return POSTag::VerbGerund;
    }
    if lower.ends_with("ed") {
        return POSTag::VerbPast;
    }
    // Must run before the plural rule: every "-ous" word also ends in a
    // single 's' and would otherwise be tagged as a plural noun.
    if lower.ends_with("ive") || lower.ends_with("ous") || lower.ends_with("ful") {
        return POSTag::Adjective;
    }
    if lower.ends_with('s') && !lower.ends_with("ss") {
        return POSTag::NounPlural;
    }
    if lower.ends_with("ly") {
        return POSTag::Adverb;
    }
    POSTag::Noun
}
/// Derives a lemma by lowercasing `word` and removing a tag-specific suffix.
///
/// * `NounPlural`: `-ies` becomes `-y`; otherwise a single trailing `s`
///   (not `ss`) is dropped.
/// * `VerbPast` / `Verb3rdSing`: `-ed` is dropped; otherwise a trailing `s`.
/// * `VerbGerund`: `-ing` is dropped.
/// * Any other tag: the lowercased word is returned unchanged.
///
/// Stripping is purely textual, so doubled consonants are not undone
/// ("running" yields "runn").
fn lemmatize(&self, word: &str, pos: &POSTag) -> String {
    let lower = word.to_lowercase();

    // How many trailing bytes to cut, and what to put in their place. All
    // recognised suffixes are ASCII, so the cut lands on a char boundary.
    let (cut, replacement): (usize, &str) = match pos {
        POSTag::NounPlural if lower.ends_with("ies") => (3, "y"),
        POSTag::NounPlural if lower.ends_with('s') && !lower.ends_with("ss") => (1, ""),
        POSTag::VerbPast | POSTag::Verb3rdSing if lower.ends_with("ed") => (2, ""),
        POSTag::VerbPast | POSTag::Verb3rdSing if lower.ends_with('s') => (1, ""),
        POSTag::VerbGerund if lower.ends_with("ing") => (3, ""),
        _ => (0, ""),
    };

    let stem = &lower[..lower.len() - cut];
    format!("{}{}", stem, replacement)
}
pub fn parse_dependencies(&self, tokens: &[Token]) -> Result<Vec<Dependency>> {
let mut dependencies = Vec::new();
if tokens.is_empty() {
return Ok(dependencies);
}
let root_idx = tokens
.iter()
.position(|t| matches!(t.pos, POSTag::Verb | POSTag::VerbPast | POSTag::Verb3rdSing))
.unwrap_or(0);
#[allow(clippy::needless_range_loop)]
for i in 0..root_idx {
if matches!(
tokens[i].pos,
POSTag::Noun | POSTag::ProperNoun | POSTag::Pronoun
) {
dependencies.push(Dependency {
head: root_idx,
dependent: i,
relation: DependencyRelation::Subject,
});
break;
}
}
#[allow(clippy::needless_range_loop)]
for i in (root_idx + 1)..tokens.len() {
if matches!(tokens[i].pos, POSTag::Noun | POSTag::ProperNoun) {
dependencies.push(Dependency {
head: root_idx,
dependent: i,
relation: DependencyRelation::DirectObject,
});
break;
}
}
for i in 0..tokens.len() {
match tokens[i].pos {
POSTag::Adjective => {
if let Some(noun_idx) = tokens[i + 1..]
.iter()
.position(|t| matches!(t.pos, POSTag::Noun | POSTag::ProperNoun))
{
dependencies.push(Dependency {
head: i + 1 + noun_idx,
dependent: i,
relation: DependencyRelation::Modifier,
});
}
},
POSTag::Adverb => {
let verb_idx = tokens.iter().position(|t| {
matches!(t.pos, POSTag::Verb | POSTag::VerbPast | POSTag::Verb3rdSing)
});
if let Some(v_idx) = verb_idx {
dependencies.push(Dependency {
head: v_idx,
dependent: i,
relation: DependencyRelation::Modifier,
});
}
},
POSTag::Determiner => {
if let Some(noun_idx) = tokens[i + 1..]
.iter()
.position(|t| matches!(t.pos, POSTag::Noun | POSTag::ProperNoun))
{
dependencies.push(Dependency {
head: i + 1 + noun_idx,
dependent: i,
relation: DependencyRelation::Determiner,
});
}
},
_ => {},
}
}
Ok(dependencies)
}
/// Groups consecutive determiner, adjective and noun tokens into phrases.
///
/// Any token with another tag ends the phrase in progress. A phrase's
/// `head_idx` is the position, within the phrase, of its last noun (singular
/// or plural, common or proper); it stays `0` if the run contains no noun.
/// `text` is the token texts joined with single spaces.
///
/// The visible implementation never produces an error.
pub fn extract_noun_phrases(&self, tokens: &[Token]) -> Result<Vec<NounPhrase>> {
    // Turns an accumulated token run into a finished phrase.
    fn finish(members: Vec<Token>, head_idx: usize) -> NounPhrase {
        let text = members
            .iter()
            .map(|t| t.text.as_str())
            .collect::<Vec<_>>()
            .join(" ");
        NounPhrase {
            tokens: members,
            head_idx,
            text,
        }
    }

    let mut phrases = Vec::new();
    let mut current_phrase: Vec<Token> = Vec::new();
    let mut head_idx = 0;

    for token in tokens {
        match token.pos {
            POSTag::Determiner | POSTag::Adjective => {
                current_phrase.push(token.clone());
            },
            POSTag::Noun
            | POSTag::ProperNoun
            | POSTag::NounPlural
            | POSTag::ProperNounPlural => {
                // The most recent noun becomes the head.
                head_idx = current_phrase.len();
                current_phrase.push(token.clone());
            },
            _ => {
                if !current_phrase.is_empty() {
                    // Move the buffer out instead of cloning it; `take`
                    // leaves an empty vector behind for the next phrase.
                    phrases.push(finish(std::mem::take(&mut current_phrase), head_idx));
                    head_idx = 0;
                }
            },
        }
    }

    // Flush a phrase that runs to the end of the input.
    if !current_phrase.is_empty() {
        phrases.push(finish(current_phrase, head_idx));
    }
    Ok(phrases)
}
/// Splits `text` into sentences at runs of `.`, `!` or `?` that are followed
/// by whitespace.
///
/// The matched terminator and whitespace are removed, so only a final
/// sentence that is not followed by whitespace keeps its punctuation. Each
/// piece is trimmed and empty pieces are discarded.
///
/// NOTE(review): the pattern is compiled on every call.
pub fn segment_sentences(&self, text: &str) -> Vec<String> {
    let boundary = Regex::new(r"[.!?]+\s+").expect("static regex literal");

    let mut sentences = Vec::new();
    for piece in boundary.split(text) {
        let trimmed = piece.trim();
        if !trimmed.is_empty() {
            sentences.push(trimmed.to_string());
        }
    }
    sentences
}
/// Builds the common-noun dictionary; every entry maps to `POSTag::Noun`.
///
/// "work" is also listed as a verb, and `tag_word` consults the verb
/// dictionary first.
fn build_noun_dict() -> HashMap<String, POSTag> {
    const NOUNS: &[&str] = &[
        "time", "person", "year", "way", "day", "thing", "man", "world", "life", "hand",
        "part", "child", "eye", "woman", "place", "work", "week", "case", "point",
        "government", "company", "number", "group", "problem", "fact",
    ];

    NOUNS
        .iter()
        .map(|&noun| (noun.to_owned(), POSTag::Noun))
        .collect()
}
/// Builds the common-verb dictionary; every entry is a base form mapped to
/// `POSTag::Verb`.
fn build_verb_dict() -> HashMap<String, POSTag> {
    const VERBS: &[&str] = &[
        "be", "have", "do", "say", "get", "make", "go", "know", "take", "see", "come",
        "think", "look", "want", "give", "use", "find", "tell", "ask", "work", "seem",
        "feel", "try", "leave", "call",
    ];

    let mut dict = HashMap::with_capacity(VERBS.len());
    for verb in VERBS.iter() {
        dict.insert((*verb).to_string(), POSTag::Verb);
    }
    dict
}
/// Builds the common-adjective dictionary; every entry maps to
/// `POSTag::Adjective`.
fn build_adjective_dict() -> HashMap<String, POSTag> {
    const ADJECTIVES: &[&str] = &[
        "good", "new", "first", "last", "long", "great", "little", "own", "other", "old",
        "right", "big", "high", "different", "small", "large", "next", "early", "young",
        "important", "few", "public", "bad", "same", "able",
    ];

    ADJECTIVES
        .iter()
        .map(|&adjective| (adjective.to_owned(), POSTag::Adjective))
        .collect()
}
/// Builds the common-adverb dictionary; every entry maps to `POSTag::Adverb`.
fn build_adverb_dict() -> HashMap<String, POSTag> {
    const ADVERBS: &[&str] = &[
        "not", "so", "out", "up", "now", "only", "just", "more", "also", "very", "well",
        "back", "there", "even", "still", "too", "here", "then", "always", "never",
        "often", "quite", "really", "almost", "again",
    ];

    let mut dict = HashMap::with_capacity(ADVERBS.len());
    for adverb in ADVERBS.iter() {
        dict.insert((*adverb).to_string(), POSTag::Adverb);
    }
    dict
}
/// Builds the preposition dictionary; every entry maps to
/// `POSTag::Preposition`.
fn build_preposition_dict() -> HashMap<String, POSTag> {
    const PREPOSITIONS: &[&str] = &[
        "of", "in", "to", "for", "with", "on", "at", "from", "by", "about", "into",
        "through", "during", "before", "after", "above", "below", "between", "under",
        "since", "without", "within", "along", "among", "across",
    ];

    PREPOSITIONS
        .iter()
        .map(|&preposition| (preposition.to_owned(), POSTag::Preposition))
        .collect()
}
/// Builds the determiner dictionary; every entry maps to
/// `POSTag::Determiner`.
///
/// `tag_word` consults this table before all others, so words that are also
/// listed as pronouns ("this", "that", "what", "which", "her") are tagged as
/// determiners.
fn build_determiner_dict() -> HashMap<String, POSTag> {
    const DETERMINERS: &[&str] = &[
        "the", "a", "an", "this", "that", "these", "those", "my", "your", "his", "her",
        "its", "our", "their", "all", "both", "each", "every", "some", "any", "no",
        "another", "such", "what", "which",
    ];

    let mut dict = HashMap::with_capacity(DETERMINERS.len());
    for determiner in DETERMINERS.iter() {
        dict.insert((*determiner).to_string(), POSTag::Determiner);
    }
    dict
}
/// Builds the pronoun dictionary; every entry maps to `POSTag::Pronoun`.
///
/// "her", "what", "which", "this" and "that" are also listed as determiners,
/// and `tag_word` consults the determiner dictionary first.
fn build_pronoun_dict() -> HashMap<String, POSTag> {
    const PRONOUNS: &[&str] = &[
        "i", "you", "he", "she", "it", "we", "they", "me", "him", "her", "us", "them",
        "who", "whom", "what", "which", "this", "that",
    ];

    PRONOUNS
        .iter()
        .map(|&pronoun| (pronoun.to_owned(), POSTag::Pronoun))
        .collect()
}
/// Builds the conjunction dictionary; every entry maps to
/// `POSTag::Conjunction`.
///
/// "so" is also listed as an adverb and "for" as a preposition; `tag_word`
/// consults both of those dictionaries before this one.
fn build_conjunction_dict() -> HashMap<String, POSTag> {
    const CONJUNCTIONS: &[&str] = &[
        "and", "or", "but", "nor", "yet", "so", "for", "because", "although", "though",
        "while", "if", "unless", "until", "when", "where",
    ];

    let mut dict = HashMap::with_capacity(CONJUNCTIONS.len());
    for conjunction in CONJUNCTIONS.iter() {
        dict.insert((*conjunction).to_string(), POSTag::Conjunction);
    }
    dict
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds an analyzer with the default configuration.
    fn default_analyzer() -> SyntaxAnalyzer {
        SyntaxAnalyzer::new(SyntaxAnalyzerConfig::default())
    }

    #[test]
    fn test_pos_tagging() {
        let tagged = default_analyzer()
            .pos_tag("The good brown fox jumps over the lazy dog.")
            .unwrap();

        assert!(!tagged.is_empty());
        // "The" and "good" are dictionary words.
        assert_eq!(tagged[0].pos, POSTag::Determiner);
        assert_eq!(tagged[1].pos, POSTag::Adjective);
        // "fox" is not in any dictionary and falls through to a noun tag.
        assert!(matches!(tagged[3].pos, POSTag::Noun | POSTag::ProperNoun));
        assert!(tagged.iter().any(|t| t.text == "jumps"));
    }

    #[test]
    fn test_lemmatization() {
        let analyzer = default_analyzer();
        // Suffix stripping is purely textual, so the doubled consonant in
        // "running" is kept.
        let cases = [
            ("running", POSTag::VerbGerund, "runn"),
            ("cats", POSTag::NounPlural, "cat"),
            ("jumped", POSTag::VerbPast, "jump"),
        ];
        for (word, tag, expected) in cases.iter() {
            assert_eq!(analyzer.lemmatize(word, tag), *expected);
        }
    }

    #[test]
    fn test_noun_phrase_extraction() {
        let analyzer = default_analyzer();
        let tagged = analyzer.pos_tag("The quick brown fox").unwrap();
        let phrases = analyzer.extract_noun_phrases(&tagged).unwrap();

        assert_eq!(phrases.len(), 1);
        assert_eq!(phrases[0].text, "The quick brown fox");
    }

    #[test]
    fn test_dependency_parsing() {
        let analyzer = default_analyzer();
        let tagged = analyzer.pos_tag("The cat chased the mouse").unwrap();
        let deps = analyzer.parse_dependencies(&tagged).unwrap();

        assert!(!deps.is_empty());
        let has_subject = deps
            .iter()
            .any(|edge| edge.relation == DependencyRelation::Subject);
        assert!(has_subject, "Should have subject dependency");
    }

    #[test]
    fn test_sentence_segmentation() {
        let sentences = default_analyzer()
            .segment_sentences("This is sentence one. This is sentence two! And sentence three?");

        assert_eq!(sentences.len(), 3);
        let fragments = ["sentence one", "sentence two", "sentence three"];
        for (sentence, fragment) in sentences.iter().zip(fragments.iter()) {
            assert!(sentence.contains(*fragment));
        }
    }

    #[test]
    fn test_tokenization() {
        let raw = default_analyzer().tokenize("Hello, world!");
        let surfaces: Vec<&str> = raw.iter().map(|(surface, _)| surface.as_str()).collect();

        // "Hello", ",", "world", "!"
        assert_eq!(surfaces.len(), 4);
        assert_eq!(surfaces[0], "Hello");
        assert_eq!(surfaces[1], ",");
    }

    #[test]
    fn test_proper_noun_detection() {
        let tagged = default_analyzer()
            .pos_tag("John Smith lives in New York")
            .unwrap();

        let found_proper_noun = tagged
            .iter()
            .any(|t| matches!(t.pos, POSTag::ProperNoun));
        assert!(found_proper_noun);
    }
}