Skip to main content

graphrag_core/nlp/
syntax_analyzer.rs

1//! Rule-based Syntax Analysis
2//!
3//! Deterministic POS tagging and dependency parsing without ML models.
4//! Lightweight implementation using pattern matching and linguistic rules.
5//!
6//! Features:
7//! - Part-of-Speech (POS) tagging
8//! - Dependency parsing (simplified)
9//! - Phrase extraction (noun phrases, verb phrases)
10//! - Sentence segmentation
11//! - Token classification
12
13use crate::Result;
14use regex::Regex;
15use std::collections::HashMap;
16
17/// Part-of-Speech tag
18#[derive(Debug, Clone, PartialEq, Eq, Hash)]
19pub enum POSTag {
20    /// Noun (singular or mass)
21    Noun,
22    /// Noun, plural
23    NounPlural,
24    /// Proper noun, singular
25    ProperNoun,
26    /// Proper noun, plural
27    ProperNounPlural,
28    /// Verb, base form
29    Verb,
30    /// Verb, past tense
31    VerbPast,
32    /// Verb, gerund or present participle
33    VerbGerund,
34    /// Verb, 3rd person singular present
35    Verb3rdSing,
36    /// Adjective
37    Adjective,
38    /// Adverb
39    Adverb,
40    /// Preposition or subordinating conjunction
41    Preposition,
42    /// Determiner
43    Determiner,
44    /// Pronoun
45    Pronoun,
46    /// Conjunction, coordinating
47    Conjunction,
48    /// Punctuation
49    Punctuation,
50    /// Number
51    Number,
52    /// Unknown/Other
53    Unknown,
54}
55
56impl POSTag {
57    /// Get Penn Treebank tag string
58    pub fn penn_tag(&self) -> &str {
59        match self {
60            POSTag::Noun => "NN",
61            POSTag::NounPlural => "NNS",
62            POSTag::ProperNoun => "NNP",
63            POSTag::ProperNounPlural => "NNPS",
64            POSTag::Verb => "VB",
65            POSTag::VerbPast => "VBD",
66            POSTag::VerbGerund => "VBG",
67            POSTag::Verb3rdSing => "VBZ",
68            POSTag::Adjective => "JJ",
69            POSTag::Adverb => "RB",
70            POSTag::Preposition => "IN",
71            POSTag::Determiner => "DT",
72            POSTag::Pronoun => "PRP",
73            POSTag::Conjunction => "CC",
74            POSTag::Punctuation => ".",
75            POSTag::Number => "CD",
76            POSTag::Unknown => "UNK",
77        }
78    }
79}
80
81/// Dependency relation type
82#[derive(Debug, Clone, PartialEq, Eq)]
83pub enum DependencyRelation {
84    /// Subject of a verb
85    Subject,
86    /// Direct object of a verb
87    DirectObject,
88    /// Indirect object
89    IndirectObject,
90    /// Modifier (adjective/adverb)
91    Modifier,
92    /// Determiner
93    Determiner,
94    /// Prepositional modifier
95    PrepositionalModifier,
96    /// Conjunction
97    Conjunction,
98    /// Complement
99    Complement,
100    /// Root of the sentence
101    Root,
102    /// Unknown relation
103    Unknown,
104}
105
106/// A token with POS tag
107#[derive(Debug, Clone)]
108pub struct Token {
109    /// The text of the token
110    pub text: String,
111    /// Position in the original text
112    pub position: usize,
113    /// POS tag
114    pub pos: POSTag,
115    /// Lemma (base form)
116    pub lemma: String,
117}
118
119/// A dependency arc between tokens
120#[derive(Debug, Clone)]
121pub struct Dependency {
122    /// Index of the head token
123    pub head: usize,
124    /// Index of the dependent token
125    pub dependent: usize,
126    /// Type of dependency relation
127    pub relation: DependencyRelation,
128}
129
130/// A noun phrase
131#[derive(Debug, Clone)]
132pub struct NounPhrase {
133    /// Tokens in the noun phrase
134    pub tokens: Vec<Token>,
135    /// Head noun index within tokens
136    pub head_idx: usize,
137    /// Text span
138    pub text: String,
139}
140
141/// Configuration for syntax analyzer
142#[derive(Debug, Clone)]
143pub struct SyntaxAnalyzerConfig {
144    /// Enable POS tagging
145    pub enable_pos_tagging: bool,
146    /// Enable dependency parsing
147    pub enable_dependency_parsing: bool,
148    /// Enable phrase extraction
149    pub enable_phrase_extraction: bool,
150}
151
152impl Default for SyntaxAnalyzerConfig {
153    fn default() -> Self {
154        Self {
155            enable_pos_tagging: true,
156            enable_dependency_parsing: true,
157            enable_phrase_extraction: true,
158        }
159    }
160}
161
162/// Rule-based syntax analyzer
163pub struct SyntaxAnalyzer {
164    #[allow(dead_code)] // Reserved for future conditional feature flags
165    config: SyntaxAnalyzerConfig,
166    // Lookup tables for POS tagging
167    common_nouns: HashMap<String, POSTag>,
168    common_verbs: HashMap<String, POSTag>,
169    common_adjectives: HashMap<String, POSTag>,
170    common_adverbs: HashMap<String, POSTag>,
171    prepositions: HashMap<String, POSTag>,
172    determiners: HashMap<String, POSTag>,
173    pronouns: HashMap<String, POSTag>,
174    conjunctions: HashMap<String, POSTag>,
175}
176
177impl SyntaxAnalyzer {
178    /// Create a new syntax analyzer
179    pub fn new(config: SyntaxAnalyzerConfig) -> Self {
180        Self {
181            config,
182            common_nouns: Self::build_noun_dict(),
183            common_verbs: Self::build_verb_dict(),
184            common_adjectives: Self::build_adjective_dict(),
185            common_adverbs: Self::build_adverb_dict(),
186            prepositions: Self::build_preposition_dict(),
187            determiners: Self::build_determiner_dict(),
188            pronouns: Self::build_pronoun_dict(),
189            conjunctions: Self::build_conjunction_dict(),
190        }
191    }
192
193    /// Tokenize text into words
194    fn tokenize(&self, text: &str) -> Vec<(String, usize)> {
195        let mut tokens = Vec::new();
196        let mut current_word = String::new();
197        let mut word_start = 0;
198
199        for (i, ch) in text.chars().enumerate() {
200            if ch.is_alphanumeric() || ch == '\'' || ch == '-' {
201                if current_word.is_empty() {
202                    word_start = i;
203                }
204                current_word.push(ch);
205            } else {
206                if !current_word.is_empty() {
207                    tokens.push((current_word.clone(), word_start));
208                    current_word.clear();
209                }
210                // Add punctuation as separate tokens
211                if !ch.is_whitespace() {
212                    tokens.push((ch.to_string(), i));
213                }
214            }
215        }
216
217        if !current_word.is_empty() {
218            tokens.push((current_word, word_start));
219        }
220
221        tokens
222    }
223
224    /// Perform POS tagging on text
225    pub fn pos_tag(&self, text: &str) -> Result<Vec<Token>> {
226        let raw_tokens = self.tokenize(text);
227        let mut tokens = Vec::new();
228
229        for (word, position) in raw_tokens {
230            let pos = self.tag_word(&word);
231            let lemma = self.lemmatize(&word, &pos);
232
233            tokens.push(Token {
234                text: word,
235                position,
236                pos,
237                lemma,
238            });
239        }
240
241        Ok(tokens)
242    }
243
244    /// Tag a single word with POS
245    fn tag_word(&self, word: &str) -> POSTag {
246        let lower = word.to_lowercase();
247
248        // Check punctuation
249        if word.chars().all(|c| c.is_ascii_punctuation()) {
250            return POSTag::Punctuation;
251        }
252
253        // Check numbers
254        if word.chars().all(|c| c.is_ascii_digit()) {
255            return POSTag::Number;
256        }
257
258        // Check dictionaries
259        if let Some(pos) = self.determiners.get(&lower) {
260            return pos.clone();
261        }
262        if let Some(pos) = self.pronouns.get(&lower) {
263            return pos.clone();
264        }
265        if let Some(pos) = self.prepositions.get(&lower) {
266            return pos.clone();
267        }
268        if let Some(pos) = self.conjunctions.get(&lower) {
269            return pos.clone();
270        }
271        if let Some(pos) = self.common_adverbs.get(&lower) {
272            return pos.clone();
273        }
274        if let Some(pos) = self.common_verbs.get(&lower) {
275            return pos.clone();
276        }
277        if let Some(pos) = self.common_adjectives.get(&lower) {
278            return pos.clone();
279        }
280        if let Some(pos) = self.common_nouns.get(&lower) {
281            return pos.clone();
282        }
283
284        // Pattern-based tagging
285        // Proper noun: capitalized and not at start of sentence
286        if word.chars().next().unwrap().is_uppercase() {
287            return POSTag::ProperNoun;
288        }
289
290        // Verb patterns
291        if lower.ends_with("ing") {
292            return POSTag::VerbGerund;
293        }
294        if lower.ends_with("ed") {
295            return POSTag::VerbPast;
296        }
297
298        // Noun patterns (plural)
299        if lower.ends_with('s') && !lower.ends_with("ss") {
300            return POSTag::NounPlural;
301        }
302
303        // Adjective patterns
304        if lower.ends_with("ive") || lower.ends_with("ous") || lower.ends_with("ful") {
305            return POSTag::Adjective;
306        }
307
308        // Adverb patterns
309        if lower.ends_with("ly") {
310            return POSTag::Adverb;
311        }
312
313        // Default to noun
314        POSTag::Noun
315    }
316
317    /// Simple lemmatization
318    fn lemmatize(&self, word: &str, pos: &POSTag) -> String {
319        let lower = word.to_lowercase();
320
321        match pos {
322            POSTag::NounPlural => {
323                // Remove plural 's'
324                if lower.ends_with("ies") {
325                    return format!("{}y", &lower[..lower.len() - 3]);
326                }
327                if lower.ends_with('s') && !lower.ends_with("ss") {
328                    return lower[..lower.len() - 1].to_string();
329                }
330                lower
331            },
332            POSTag::VerbPast | POSTag::Verb3rdSing => {
333                // Remove -ed, -s
334                if lower.ends_with("ed") {
335                    return lower[..lower.len() - 2].to_string();
336                }
337                if lower.ends_with('s') {
338                    return lower[..lower.len() - 1].to_string();
339                }
340                lower
341            },
342            POSTag::VerbGerund => {
343                // Remove -ing
344                if lower.ends_with("ing") {
345                    return lower[..lower.len() - 3].to_string();
346                }
347                lower
348            },
349            _ => lower,
350        }
351    }
352
353    /// Parse dependencies (simplified)
354    pub fn parse_dependencies(&self, tokens: &[Token]) -> Result<Vec<Dependency>> {
355        let mut dependencies = Vec::new();
356
357        if tokens.is_empty() {
358            return Ok(dependencies);
359        }
360
361        // Find the main verb (root)
362        let root_idx = tokens
363            .iter()
364            .position(|t| matches!(t.pos, POSTag::Verb | POSTag::VerbPast | POSTag::Verb3rdSing))
365            .unwrap_or(0);
366
367        // Find subject (noun/pronoun before verb)
368        #[allow(clippy::needless_range_loop)]
369        for i in 0..root_idx {
370            if matches!(
371                tokens[i].pos,
372                POSTag::Noun | POSTag::ProperNoun | POSTag::Pronoun
373            ) {
374                dependencies.push(Dependency {
375                    head: root_idx,
376                    dependent: i,
377                    relation: DependencyRelation::Subject,
378                });
379                break;
380            }
381        }
382
383        // Find object (noun after verb)
384        #[allow(clippy::needless_range_loop)]
385        for i in (root_idx + 1)..tokens.len() {
386            if matches!(tokens[i].pos, POSTag::Noun | POSTag::ProperNoun) {
387                dependencies.push(Dependency {
388                    head: root_idx,
389                    dependent: i,
390                    relation: DependencyRelation::DirectObject,
391                });
392                break;
393            }
394        }
395
396        // Find modifiers (adjectives before nouns, adverbs near verbs)
397        for i in 0..tokens.len() {
398            match tokens[i].pos {
399                POSTag::Adjective => {
400                    // Find next noun
401                    if let Some(noun_idx) = tokens[i + 1..]
402                        .iter()
403                        .position(|t| matches!(t.pos, POSTag::Noun | POSTag::ProperNoun))
404                    {
405                        dependencies.push(Dependency {
406                            head: i + 1 + noun_idx,
407                            dependent: i,
408                            relation: DependencyRelation::Modifier,
409                        });
410                    }
411                },
412                POSTag::Adverb => {
413                    // Modify nearest verb
414                    let verb_idx = tokens.iter().position(|t| {
415                        matches!(t.pos, POSTag::Verb | POSTag::VerbPast | POSTag::Verb3rdSing)
416                    });
417                    if let Some(v_idx) = verb_idx {
418                        dependencies.push(Dependency {
419                            head: v_idx,
420                            dependent: i,
421                            relation: DependencyRelation::Modifier,
422                        });
423                    }
424                },
425                POSTag::Determiner => {
426                    // Determine next noun
427                    if let Some(noun_idx) = tokens[i + 1..]
428                        .iter()
429                        .position(|t| matches!(t.pos, POSTag::Noun | POSTag::ProperNoun))
430                    {
431                        dependencies.push(Dependency {
432                            head: i + 1 + noun_idx,
433                            dependent: i,
434                            relation: DependencyRelation::Determiner,
435                        });
436                    }
437                },
438                _ => {},
439            }
440        }
441
442        Ok(dependencies)
443    }
444
445    /// Extract noun phrases
446    pub fn extract_noun_phrases(&self, tokens: &[Token]) -> Result<Vec<NounPhrase>> {
447        let mut phrases = Vec::new();
448        let mut current_phrase: Vec<Token> = Vec::new();
449        let mut head_idx = 0;
450
451        for token in tokens {
452            match token.pos {
453                POSTag::Determiner | POSTag::Adjective => {
454                    // Start or continue noun phrase
455                    current_phrase.push(token.clone());
456                },
457                POSTag::Noun
458                | POSTag::ProperNoun
459                | POSTag::NounPlural
460                | POSTag::ProperNounPlural => {
461                    // Add noun to phrase
462                    head_idx = current_phrase.len();
463                    current_phrase.push(token.clone());
464                },
465                _ => {
466                    // End of noun phrase
467                    if !current_phrase.is_empty() {
468                        let text = current_phrase
469                            .iter()
470                            .map(|t| t.text.as_str())
471                            .collect::<Vec<_>>()
472                            .join(" ");
473
474                        phrases.push(NounPhrase {
475                            tokens: current_phrase.clone(),
476                            head_idx,
477                            text,
478                        });
479
480                        current_phrase.clear();
481                        head_idx = 0;
482                    }
483                },
484            }
485        }
486
487        // Add final phrase if exists
488        if !current_phrase.is_empty() {
489            let text = current_phrase
490                .iter()
491                .map(|t| t.text.as_str())
492                .collect::<Vec<_>>()
493                .join(" ");
494
495            phrases.push(NounPhrase {
496                tokens: current_phrase,
497                head_idx,
498                text,
499            });
500        }
501
502        Ok(phrases)
503    }
504
505    /// Segment text into sentences
506    pub fn segment_sentences(&self, text: &str) -> Vec<String> {
507        let sentence_regex = Regex::new(r"[.!?]+\s+").unwrap();
508        sentence_regex
509            .split(text)
510            .map(|s| s.trim().to_string())
511            .filter(|s| !s.is_empty())
512            .collect()
513    }
514
515    // Dictionary builders
516    fn build_noun_dict() -> HashMap<String, POSTag> {
517        let nouns = vec![
518            "time",
519            "person",
520            "year",
521            "way",
522            "day",
523            "thing",
524            "man",
525            "world",
526            "life",
527            "hand",
528            "part",
529            "child",
530            "eye",
531            "woman",
532            "place",
533            "work",
534            "week",
535            "case",
536            "point",
537            "government",
538            "company",
539            "number",
540            "group",
541            "problem",
542            "fact",
543        ];
544        nouns
545            .into_iter()
546            .map(|s| (s.to_string(), POSTag::Noun))
547            .collect()
548    }
549
550    fn build_verb_dict() -> HashMap<String, POSTag> {
551        let verbs = vec![
552            "be", "have", "do", "say", "get", "make", "go", "know", "take", "see", "come", "think",
553            "look", "want", "give", "use", "find", "tell", "ask", "work", "seem", "feel", "try",
554            "leave", "call",
555        ];
556        verbs
557            .into_iter()
558            .map(|s| (s.to_string(), POSTag::Verb))
559            .collect()
560    }
561
562    fn build_adjective_dict() -> HashMap<String, POSTag> {
563        let adjectives = vec![
564            "good",
565            "new",
566            "first",
567            "last",
568            "long",
569            "great",
570            "little",
571            "own",
572            "other",
573            "old",
574            "right",
575            "big",
576            "high",
577            "different",
578            "small",
579            "large",
580            "next",
581            "early",
582            "young",
583            "important",
584            "few",
585            "public",
586            "bad",
587            "same",
588            "able",
589        ];
590        adjectives
591            .into_iter()
592            .map(|s| (s.to_string(), POSTag::Adjective))
593            .collect()
594    }
595
596    fn build_adverb_dict() -> HashMap<String, POSTag> {
597        let adverbs = vec![
598            "not", "so", "out", "up", "now", "only", "just", "more", "also", "very", "well",
599            "back", "there", "even", "still", "too", "here", "then", "always", "never", "often",
600            "quite", "really", "almost", "again",
601        ];
602        adverbs
603            .into_iter()
604            .map(|s| (s.to_string(), POSTag::Adverb))
605            .collect()
606    }
607
608    fn build_preposition_dict() -> HashMap<String, POSTag> {
609        let prepositions = vec![
610            "of", "in", "to", "for", "with", "on", "at", "from", "by", "about", "into", "through",
611            "during", "before", "after", "above", "below", "between", "under", "since", "without",
612            "within", "along", "among", "across",
613        ];
614        prepositions
615            .into_iter()
616            .map(|s| (s.to_string(), POSTag::Preposition))
617            .collect()
618    }
619
620    fn build_determiner_dict() -> HashMap<String, POSTag> {
621        let determiners = vec![
622            "the", "a", "an", "this", "that", "these", "those", "my", "your", "his", "her", "its",
623            "our", "their", "all", "both", "each", "every", "some", "any", "no", "another", "such",
624            "what", "which",
625        ];
626        determiners
627            .into_iter()
628            .map(|s| (s.to_string(), POSTag::Determiner))
629            .collect()
630    }
631
632    fn build_pronoun_dict() -> HashMap<String, POSTag> {
633        let pronouns = vec![
634            "i", "you", "he", "she", "it", "we", "they", "me", "him", "her", "us", "them", "who",
635            "whom", "what", "which", "this", "that",
636        ];
637        pronouns
638            .into_iter()
639            .map(|s| (s.to_string(), POSTag::Pronoun))
640            .collect()
641    }
642
643    fn build_conjunction_dict() -> HashMap<String, POSTag> {
644        let conjunctions = vec![
645            "and", "or", "but", "nor", "yet", "so", "for", "because", "although", "though",
646            "while", "if", "unless", "until", "when", "where",
647        ];
648        conjunctions
649            .into_iter()
650            .map(|s| (s.to_string(), POSTag::Conjunction))
651            .collect()
652    }
653}
654
655#[cfg(test)]
656mod tests {
657    use super::*;
658
659    #[test]
660    fn test_pos_tagging() {
661        let analyzer = SyntaxAnalyzer::new(SyntaxAnalyzerConfig::default());
662        let text = "The good brown fox jumps over the lazy dog.";
663
664        let tokens = analyzer.pos_tag(text).unwrap();
665
666        assert!(!tokens.is_empty());
667
668        // Check some expected tags
669        assert_eq!(tokens[0].pos, POSTag::Determiner); // "The"
670        assert_eq!(tokens[1].pos, POSTag::Adjective); // "good" (in dictionary)
671        assert!(matches!(tokens[3].pos, POSTag::Noun | POSTag::ProperNoun)); // "fox"
672                                                                             // "jumps" ends with 's' but may be tagged as plural noun, so we check it's present
673        assert!(tokens.iter().any(|t| t.text == "jumps"));
674    }
675
676    #[test]
677    fn test_lemmatization() {
678        let analyzer = SyntaxAnalyzer::new(SyntaxAnalyzerConfig::default());
679
680        assert_eq!(analyzer.lemmatize("running", &POSTag::VerbGerund), "runn");
681        assert_eq!(analyzer.lemmatize("cats", &POSTag::NounPlural), "cat");
682        assert_eq!(analyzer.lemmatize("jumped", &POSTag::VerbPast), "jump");
683    }
684
685    #[test]
686    fn test_noun_phrase_extraction() {
687        let analyzer = SyntaxAnalyzer::new(SyntaxAnalyzerConfig::default());
688        let text = "The quick brown fox";
689
690        let tokens = analyzer.pos_tag(text).unwrap();
691        let phrases = analyzer.extract_noun_phrases(&tokens).unwrap();
692
693        assert_eq!(phrases.len(), 1);
694        assert_eq!(phrases[0].text, "The quick brown fox");
695    }
696
697    #[test]
698    fn test_dependency_parsing() {
699        let analyzer = SyntaxAnalyzer::new(SyntaxAnalyzerConfig::default());
700        let text = "The cat chased the mouse";
701
702        let tokens = analyzer.pos_tag(text).unwrap();
703        let deps = analyzer.parse_dependencies(&tokens).unwrap();
704
705        // Should have subject and object dependencies
706        assert!(!deps.is_empty());
707
708        // Find subject dependency
709        let has_subject = deps
710            .iter()
711            .any(|d| matches!(d.relation, DependencyRelation::Subject));
712        assert!(has_subject, "Should have subject dependency");
713    }
714
715    #[test]
716    fn test_sentence_segmentation() {
717        let analyzer = SyntaxAnalyzer::new(SyntaxAnalyzerConfig::default());
718        let text = "This is sentence one. This is sentence two! And sentence three?";
719
720        let sentences = analyzer.segment_sentences(text);
721
722        assert_eq!(sentences.len(), 3);
723        assert!(sentences[0].contains("sentence one"));
724        assert!(sentences[1].contains("sentence two"));
725        assert!(sentences[2].contains("sentence three"));
726    }
727
728    #[test]
729    fn test_tokenization() {
730        let analyzer = SyntaxAnalyzer::new(SyntaxAnalyzerConfig::default());
731        let text = "Hello, world!";
732
733        let tokens = analyzer.tokenize(text);
734
735        assert_eq!(tokens.len(), 4); // "Hello", ",", "world", "!"
736        assert_eq!(tokens[0].0, "Hello");
737        assert_eq!(tokens[1].0, ",");
738    }
739
740    #[test]
741    fn test_proper_noun_detection() {
742        let analyzer = SyntaxAnalyzer::new(SyntaxAnalyzerConfig::default());
743        let text = "John Smith lives in New York";
744
745        let tokens = analyzer.pos_tag(text).unwrap();
746
747        // Should detect proper nouns
748        let proper_nouns: Vec<_> = tokens
749            .iter()
750            .filter(|t| matches!(t.pos, POSTag::ProperNoun))
751            .collect();
752
753        assert!(!proper_nouns.is_empty());
754    }
755}