1use crate::Result;
14use regex::Regex;
15use std::collections::HashMap;
16
/// Part-of-speech categories assigned to tokens, mirroring the Penn
/// Treebank tagset (see [`POSTag::penn_tag`] for the string form).
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum POSTag {
    /// Common noun, singular (NN).
    Noun,
    /// Common noun, plural (NNS).
    NounPlural,
    /// Proper noun, singular (NNP).
    ProperNoun,
    /// Proper noun, plural (NNPS).
    ProperNounPlural,
    /// Verb, base form (VB).
    Verb,
    /// Verb, past tense (VBD).
    VerbPast,
    /// Verb, gerund / present participle (VBG).
    VerbGerund,
    /// Verb, third-person singular present (VBZ).
    Verb3rdSing,
    /// Adjective (JJ).
    Adjective,
    /// Adverb (RB).
    Adverb,
    /// Preposition or subordinating conjunction (IN).
    Preposition,
    /// Determiner (DT).
    Determiner,
    /// Personal pronoun (PRP).
    Pronoun,
    /// Coordinating conjunction (CC).
    Conjunction,
    /// Punctuation mark (".").
    Punctuation,
    /// Cardinal number (CD).
    Number,
    /// Tag could not be determined ("UNK").
    Unknown,
}
55
56impl POSTag {
57 pub fn penn_tag(&self) -> &str {
59 match self {
60 POSTag::Noun => "NN",
61 POSTag::NounPlural => "NNS",
62 POSTag::ProperNoun => "NNP",
63 POSTag::ProperNounPlural => "NNPS",
64 POSTag::Verb => "VB",
65 POSTag::VerbPast => "VBD",
66 POSTag::VerbGerund => "VBG",
67 POSTag::Verb3rdSing => "VBZ",
68 POSTag::Adjective => "JJ",
69 POSTag::Adverb => "RB",
70 POSTag::Preposition => "IN",
71 POSTag::Determiner => "DT",
72 POSTag::Pronoun => "PRP",
73 POSTag::Conjunction => "CC",
74 POSTag::Punctuation => ".",
75 POSTag::Number => "CD",
76 POSTag::Unknown => "UNK",
77 }
78 }
79}
80
/// Grammatical relations used in the shallow dependency parse.
///
/// NOTE(review): the current `parse_dependencies` emits only `Subject`,
/// `DirectObject`, `Modifier`, and `Determiner`; the remaining variants
/// are declared for completeness.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum DependencyRelation {
    /// Nominal subject of the root verb.
    Subject,
    /// Direct object of the root verb.
    DirectObject,
    /// Indirect object.
    IndirectObject,
    /// Adjectival or adverbial modifier.
    Modifier,
    /// Determiner attached to a noun.
    Determiner,
    /// Prepositional-phrase modifier.
    PrepositionalModifier,
    /// Coordinating conjunction link.
    Conjunction,
    /// Clausal or verbal complement.
    Complement,
    /// Root of the sentence.
    Root,
    /// Relation could not be determined.
    Unknown,
}
105
/// A single token produced by [`SyntaxAnalyzer::pos_tag`].
#[derive(Debug, Clone)]
pub struct Token {
    /// Surface form exactly as it appeared in the input.
    pub text: String,
    /// Character index (not byte offset) of the token's first character
    /// in the input text.
    pub position: usize,
    /// Part-of-speech tag assigned by the tagger.
    pub pos: POSTag,
    /// Lowercased base form produced by suffix stripping.
    pub lemma: String,
}
118
/// A directed head -> dependent arc between two tokens of a sentence.
#[derive(Debug, Clone)]
pub struct Dependency {
    /// Index of the head token in the tagged token slice.
    pub head: usize,
    /// Index of the dependent token in the tagged token slice.
    pub dependent: usize,
    /// Grammatical relation linking the dependent to its head.
    pub relation: DependencyRelation,
}
129
/// A contiguous determiner/adjective/noun sequence extracted from a
/// tagged token stream.
#[derive(Debug, Clone)]
pub struct NounPhrase {
    /// Tokens making up the phrase, in input order.
    pub tokens: Vec<Token>,
    /// Index of the head noun within `tokens` (the last noun collected).
    pub head_idx: usize,
    /// Phrase tokens' text joined with single spaces.
    pub text: String,
}
140
/// Feature switches for [`SyntaxAnalyzer`].
///
/// NOTE(review): the analyzer currently stores but does not consult
/// these flags (the `config` field carries `#[allow(dead_code)]`); all
/// stages run unconditionally.
#[derive(Debug, Clone)]
pub struct SyntaxAnalyzerConfig {
    /// Enable part-of-speech tagging.
    pub enable_pos_tagging: bool,
    /// Enable shallow dependency parsing.
    pub enable_dependency_parsing: bool,
    /// Enable noun-phrase extraction.
    pub enable_phrase_extraction: bool,
}
151
152impl Default for SyntaxAnalyzerConfig {
153 fn default() -> Self {
154 Self {
155 enable_pos_tagging: true,
156 enable_dependency_parsing: true,
157 enable_phrase_extraction: true,
158 }
159 }
160}
161
/// Rule-based syntax analyzer providing POS tagging, shallow dependency
/// parsing, noun-phrase extraction, and sentence segmentation.
pub struct SyntaxAnalyzer {
    // Stored but not currently consulted; kept for API stability.
    #[allow(dead_code)] config: SyntaxAnalyzerConfig,
    // Dictionaries mapping lowercase words to their POS tag; looked up
    // in a fixed priority order by `tag_word`.
    common_nouns: HashMap<String, POSTag>,
    common_verbs: HashMap<String, POSTag>,
    common_adjectives: HashMap<String, POSTag>,
    common_adverbs: HashMap<String, POSTag>,
    prepositions: HashMap<String, POSTag>,
    determiners: HashMap<String, POSTag>,
    pronouns: HashMap<String, POSTag>,
    conjunctions: HashMap<String, POSTag>,
}
176
177impl SyntaxAnalyzer {
178 pub fn new(config: SyntaxAnalyzerConfig) -> Self {
180 Self {
181 config,
182 common_nouns: Self::build_noun_dict(),
183 common_verbs: Self::build_verb_dict(),
184 common_adjectives: Self::build_adjective_dict(),
185 common_adverbs: Self::build_adverb_dict(),
186 prepositions: Self::build_preposition_dict(),
187 determiners: Self::build_determiner_dict(),
188 pronouns: Self::build_pronoun_dict(),
189 conjunctions: Self::build_conjunction_dict(),
190 }
191 }
192
193 fn tokenize(&self, text: &str) -> Vec<(String, usize)> {
195 let mut tokens = Vec::new();
196 let mut current_word = String::new();
197 let mut word_start = 0;
198
199 for (i, ch) in text.chars().enumerate() {
200 if ch.is_alphanumeric() || ch == '\'' || ch == '-' {
201 if current_word.is_empty() {
202 word_start = i;
203 }
204 current_word.push(ch);
205 } else {
206 if !current_word.is_empty() {
207 tokens.push((current_word.clone(), word_start));
208 current_word.clear();
209 }
210 if !ch.is_whitespace() {
212 tokens.push((ch.to_string(), i));
213 }
214 }
215 }
216
217 if !current_word.is_empty() {
218 tokens.push((current_word, word_start));
219 }
220
221 tokens
222 }
223
224 pub fn pos_tag(&self, text: &str) -> Result<Vec<Token>> {
226 let raw_tokens = self.tokenize(text);
227 let mut tokens = Vec::new();
228
229 for (word, position) in raw_tokens {
230 let pos = self.tag_word(&word);
231 let lemma = self.lemmatize(&word, &pos);
232
233 tokens.push(Token {
234 text: word,
235 position,
236 pos,
237 lemma,
238 });
239 }
240
241 Ok(tokens)
242 }
243
244 fn tag_word(&self, word: &str) -> POSTag {
246 let lower = word.to_lowercase();
247
248 if word.chars().all(|c| c.is_ascii_punctuation()) {
250 return POSTag::Punctuation;
251 }
252
253 if word.chars().all(|c| c.is_ascii_digit()) {
255 return POSTag::Number;
256 }
257
258 if let Some(pos) = self.determiners.get(&lower) {
260 return pos.clone();
261 }
262 if let Some(pos) = self.pronouns.get(&lower) {
263 return pos.clone();
264 }
265 if let Some(pos) = self.prepositions.get(&lower) {
266 return pos.clone();
267 }
268 if let Some(pos) = self.conjunctions.get(&lower) {
269 return pos.clone();
270 }
271 if let Some(pos) = self.common_adverbs.get(&lower) {
272 return pos.clone();
273 }
274 if let Some(pos) = self.common_verbs.get(&lower) {
275 return pos.clone();
276 }
277 if let Some(pos) = self.common_adjectives.get(&lower) {
278 return pos.clone();
279 }
280 if let Some(pos) = self.common_nouns.get(&lower) {
281 return pos.clone();
282 }
283
284 if word.chars().next().unwrap().is_uppercase() {
287 return POSTag::ProperNoun;
288 }
289
290 if lower.ends_with("ing") {
292 return POSTag::VerbGerund;
293 }
294 if lower.ends_with("ed") {
295 return POSTag::VerbPast;
296 }
297
298 if lower.ends_with('s') && !lower.ends_with("ss") {
300 return POSTag::NounPlural;
301 }
302
303 if lower.ends_with("ive") || lower.ends_with("ous") || lower.ends_with("ful") {
305 return POSTag::Adjective;
306 }
307
308 if lower.ends_with("ly") {
310 return POSTag::Adverb;
311 }
312
313 POSTag::Noun
315 }
316
317 fn lemmatize(&self, word: &str, pos: &POSTag) -> String {
319 let lower = word.to_lowercase();
320
321 match pos {
322 POSTag::NounPlural => {
323 if lower.ends_with("ies") {
325 return format!("{}y", &lower[..lower.len() - 3]);
326 }
327 if lower.ends_with('s') && !lower.ends_with("ss") {
328 return lower[..lower.len() - 1].to_string();
329 }
330 lower
331 }
332 POSTag::VerbPast | POSTag::Verb3rdSing => {
333 if lower.ends_with("ed") {
335 return lower[..lower.len() - 2].to_string();
336 }
337 if lower.ends_with('s') {
338 return lower[..lower.len() - 1].to_string();
339 }
340 lower
341 }
342 POSTag::VerbGerund => {
343 if lower.ends_with("ing") {
345 return lower[..lower.len() - 3].to_string();
346 }
347 lower
348 }
349 _ => lower,
350 }
351 }
352
353 pub fn parse_dependencies(&self, tokens: &[Token]) -> Result<Vec<Dependency>> {
355 let mut dependencies = Vec::new();
356
357 if tokens.is_empty() {
358 return Ok(dependencies);
359 }
360
361 let root_idx = tokens
363 .iter()
364 .position(|t| matches!(t.pos, POSTag::Verb | POSTag::VerbPast | POSTag::Verb3rdSing))
365 .unwrap_or(0);
366
367 for i in 0..root_idx {
369 if matches!(
370 tokens[i].pos,
371 POSTag::Noun | POSTag::ProperNoun | POSTag::Pronoun
372 ) {
373 dependencies.push(Dependency {
374 head: root_idx,
375 dependent: i,
376 relation: DependencyRelation::Subject,
377 });
378 break;
379 }
380 }
381
382 for i in (root_idx + 1)..tokens.len() {
384 if matches!(tokens[i].pos, POSTag::Noun | POSTag::ProperNoun) {
385 dependencies.push(Dependency {
386 head: root_idx,
387 dependent: i,
388 relation: DependencyRelation::DirectObject,
389 });
390 break;
391 }
392 }
393
394 for i in 0..tokens.len() {
396 match tokens[i].pos {
397 POSTag::Adjective => {
398 if let Some(noun_idx) =
400 tokens[i + 1..].iter().position(|t| matches!(t.pos, POSTag::Noun | POSTag::ProperNoun))
401 {
402 dependencies.push(Dependency {
403 head: i + 1 + noun_idx,
404 dependent: i,
405 relation: DependencyRelation::Modifier,
406 });
407 }
408 }
409 POSTag::Adverb => {
410 let verb_idx = tokens.iter().position(|t| {
412 matches!(t.pos, POSTag::Verb | POSTag::VerbPast | POSTag::Verb3rdSing)
413 });
414 if let Some(v_idx) = verb_idx {
415 dependencies.push(Dependency {
416 head: v_idx,
417 dependent: i,
418 relation: DependencyRelation::Modifier,
419 });
420 }
421 }
422 POSTag::Determiner => {
423 if let Some(noun_idx) =
425 tokens[i + 1..].iter().position(|t| matches!(t.pos, POSTag::Noun | POSTag::ProperNoun))
426 {
427 dependencies.push(Dependency {
428 head: i + 1 + noun_idx,
429 dependent: i,
430 relation: DependencyRelation::Determiner,
431 });
432 }
433 }
434 _ => {}
435 }
436 }
437
438 Ok(dependencies)
439 }
440
441 pub fn extract_noun_phrases(&self, tokens: &[Token]) -> Result<Vec<NounPhrase>> {
443 let mut phrases = Vec::new();
444 let mut current_phrase: Vec<Token> = Vec::new();
445 let mut head_idx = 0;
446
447 for token in tokens {
448 match token.pos {
449 POSTag::Determiner | POSTag::Adjective => {
450 current_phrase.push(token.clone());
452 }
453 POSTag::Noun | POSTag::ProperNoun | POSTag::NounPlural | POSTag::ProperNounPlural => {
454 head_idx = current_phrase.len();
456 current_phrase.push(token.clone());
457 }
458 _ => {
459 if !current_phrase.is_empty() {
461 let text = current_phrase
462 .iter()
463 .map(|t| t.text.as_str())
464 .collect::<Vec<_>>()
465 .join(" ");
466
467 phrases.push(NounPhrase {
468 tokens: current_phrase.clone(),
469 head_idx,
470 text,
471 });
472
473 current_phrase.clear();
474 head_idx = 0;
475 }
476 }
477 }
478 }
479
480 if !current_phrase.is_empty() {
482 let text = current_phrase
483 .iter()
484 .map(|t| t.text.as_str())
485 .collect::<Vec<_>>()
486 .join(" ");
487
488 phrases.push(NounPhrase {
489 tokens: current_phrase,
490 head_idx,
491 text,
492 });
493 }
494
495 Ok(phrases)
496 }
497
498 pub fn segment_sentences(&self, text: &str) -> Vec<String> {
500 let sentence_regex = Regex::new(r"[.!?]+\s+").unwrap();
501 sentence_regex
502 .split(text)
503 .map(|s| s.trim().to_string())
504 .filter(|s| !s.is_empty())
505 .collect()
506 }
507
508 fn build_noun_dict() -> HashMap<String, POSTag> {
510 let nouns = vec![
511 "time", "person", "year", "way", "day", "thing", "man", "world", "life",
512 "hand", "part", "child", "eye", "woman", "place", "work", "week", "case",
513 "point", "government", "company", "number", "group", "problem", "fact",
514 ];
515 nouns.into_iter().map(|s| (s.to_string(), POSTag::Noun)).collect()
516 }
517
518 fn build_verb_dict() -> HashMap<String, POSTag> {
519 let verbs = vec![
520 "be", "have", "do", "say", "get", "make", "go", "know", "take", "see",
521 "come", "think", "look", "want", "give", "use", "find", "tell", "ask",
522 "work", "seem", "feel", "try", "leave", "call",
523 ];
524 verbs.into_iter().map(|s| (s.to_string(), POSTag::Verb)).collect()
525 }
526
527 fn build_adjective_dict() -> HashMap<String, POSTag> {
528 let adjectives = vec![
529 "good", "new", "first", "last", "long", "great", "little", "own", "other",
530 "old", "right", "big", "high", "different", "small", "large", "next",
531 "early", "young", "important", "few", "public", "bad", "same", "able",
532 ];
533 adjectives.into_iter().map(|s| (s.to_string(), POSTag::Adjective)).collect()
534 }
535
536 fn build_adverb_dict() -> HashMap<String, POSTag> {
537 let adverbs = vec![
538 "not", "so", "out", "up", "now", "only", "just", "more", "also", "very",
539 "well", "back", "there", "even", "still", "too", "here", "then", "always",
540 "never", "often", "quite", "really", "almost", "again",
541 ];
542 adverbs.into_iter().map(|s| (s.to_string(), POSTag::Adverb)).collect()
543 }
544
545 fn build_preposition_dict() -> HashMap<String, POSTag> {
546 let prepositions = vec![
547 "of", "in", "to", "for", "with", "on", "at", "from", "by", "about",
548 "into", "through", "during", "before", "after", "above", "below", "between",
549 "under", "since", "without", "within", "along", "among", "across",
550 ];
551 prepositions.into_iter().map(|s| (s.to_string(), POSTag::Preposition)).collect()
552 }
553
554 fn build_determiner_dict() -> HashMap<String, POSTag> {
555 let determiners = vec![
556 "the", "a", "an", "this", "that", "these", "those", "my", "your",
557 "his", "her", "its", "our", "their", "all", "both", "each", "every",
558 "some", "any", "no", "another", "such", "what", "which",
559 ];
560 determiners.into_iter().map(|s| (s.to_string(), POSTag::Determiner)).collect()
561 }
562
563 fn build_pronoun_dict() -> HashMap<String, POSTag> {
564 let pronouns = vec![
565 "i", "you", "he", "she", "it", "we", "they", "me", "him", "her",
566 "us", "them", "who", "whom", "what", "which", "this", "that",
567 ];
568 pronouns.into_iter().map(|s| (s.to_string(), POSTag::Pronoun)).collect()
569 }
570
571 fn build_conjunction_dict() -> HashMap<String, POSTag> {
572 let conjunctions = vec![
573 "and", "or", "but", "nor", "yet", "so", "for", "because", "although",
574 "though", "while", "if", "unless", "until", "when", "where",
575 ];
576 conjunctions.into_iter().map(|s| (s.to_string(), POSTag::Conjunction)).collect()
577 }
578}
579
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_pos_tagging() {
        let analyzer = SyntaxAnalyzer::new(SyntaxAnalyzerConfig::default());

        let tokens = analyzer
            .pos_tag("The good brown fox jumps over the lazy dog.")
            .unwrap();

        assert!(!tokens.is_empty());
        assert_eq!(tokens[0].pos, POSTag::Determiner);
        assert_eq!(tokens[1].pos, POSTag::Adjective);
        assert!(matches!(tokens[3].pos, POSTag::Noun | POSTag::ProperNoun));
        assert!(tokens.iter().any(|t| t.text == "jumps"));
    }

    #[test]
    fn test_lemmatization() {
        let analyzer = SyntaxAnalyzer::new(SyntaxAnalyzerConfig::default());

        // The lemmatizer is deliberately naive suffix stripping: it does
        // not restore doubled consonants, hence "runn" rather than "run".
        assert_eq!(analyzer.lemmatize("running", &POSTag::VerbGerund), "runn");
        assert_eq!(analyzer.lemmatize("cats", &POSTag::NounPlural), "cat");
        assert_eq!(analyzer.lemmatize("jumped", &POSTag::VerbPast), "jump");
    }

    #[test]
    fn test_noun_phrase_extraction() {
        let analyzer = SyntaxAnalyzer::new(SyntaxAnalyzerConfig::default());

        let tokens = analyzer.pos_tag("The quick brown fox").unwrap();
        let phrases = analyzer.extract_noun_phrases(&tokens).unwrap();

        // The whole determiner + adjectives + noun run is one phrase.
        assert_eq!(phrases.len(), 1);
        assert_eq!(phrases[0].text, "The quick brown fox");
    }

    #[test]
    fn test_dependency_parsing() {
        let analyzer = SyntaxAnalyzer::new(SyntaxAnalyzerConfig::default());

        let tokens = analyzer.pos_tag("The cat chased the mouse").unwrap();
        let deps = analyzer.parse_dependencies(&tokens).unwrap();

        assert!(!deps.is_empty());

        let has_subject = deps
            .iter()
            .any(|d| matches!(d.relation, DependencyRelation::Subject));
        assert!(has_subject, "Should have subject dependency");
    }

    #[test]
    fn test_sentence_segmentation() {
        let analyzer = SyntaxAnalyzer::new(SyntaxAnalyzerConfig::default());

        let sentences = analyzer
            .segment_sentences("This is sentence one. This is sentence two! And sentence three?");

        assert_eq!(sentences.len(), 3);
        assert!(sentences[0].contains("sentence one"));
        assert!(sentences[1].contains("sentence two"));
        assert!(sentences[2].contains("sentence three"));
    }

    #[test]
    fn test_tokenization() {
        let analyzer = SyntaxAnalyzer::new(SyntaxAnalyzerConfig::default());

        let tokens = analyzer.tokenize("Hello, world!");

        // "Hello", ",", "world", "!" — punctuation becomes its own token.
        assert_eq!(tokens.len(), 4);
        assert_eq!(tokens[0].0, "Hello");
        assert_eq!(tokens[1].0, ",");
    }

    #[test]
    fn test_proper_noun_detection() {
        let analyzer = SyntaxAnalyzer::new(SyntaxAnalyzerConfig::default());

        let tokens = analyzer.pos_tag("John Smith lives in New York").unwrap();

        // Capitalized out-of-dictionary words should tag as proper nouns.
        let proper_nouns: Vec<_> = tokens
            .iter()
            .filter(|t| matches!(t.pos, POSTag::ProperNoun))
            .collect();
        assert!(!proper_nouns.is_empty());
    }
}