Skip to main content

graphrag_core/nlp/
syntax_analyzer.rs

//! Rule-based Syntax Analysis
//!
//! Deterministic POS tagging and dependency parsing without ML models.
//! Lightweight implementation using pattern matching and linguistic rules.
//!
//! Features:
//! - Part-of-Speech (POS) tagging
//! - Dependency parsing (simplified)
//! - Phrase extraction (noun phrases, verb phrases)
//! - Sentence segmentation
//! - Token classification
12
13use crate::Result;
14use regex::Regex;
15use std::collections::HashMap;
16
/// Part-of-speech category assigned to a token.
///
/// Every variant has a short label available through [`POSTag::penn_tag`].
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum POSTag {
    /// Noun (singular or mass)
    Noun,
    /// Noun, plural
    NounPlural,
    /// Proper noun, singular
    ProperNoun,
    /// Proper noun, plural
    ProperNounPlural,
    /// Verb, base form
    Verb,
    /// Verb, past tense
    VerbPast,
    /// Verb, gerund or present participle
    VerbGerund,
    /// Verb, 3rd person singular present
    Verb3rdSing,
    /// Adjective
    Adjective,
    /// Adverb
    Adverb,
    /// Preposition or subordinating conjunction
    Preposition,
    /// Determiner
    Determiner,
    /// Pronoun
    Pronoun,
    /// Conjunction, coordinating
    Conjunction,
    /// Punctuation
    Punctuation,
    /// Number
    Number,
    /// Unknown/Other
    Unknown,
}

impl POSTag {
    /// Returns the Penn Treebank-style label for this tag.
    ///
    /// Every label is a string literal, so the result is `&'static str`: it
    /// can be stored or returned without keeping the `POSTag` value borrowed.
    /// All punctuation shares the single label `"."`, and [`POSTag::Unknown`]
    /// is reported as `"UNK"`.
    pub fn penn_tag(&self) -> &'static str {
        match self {
            POSTag::Noun => "NN",
            POSTag::NounPlural => "NNS",
            POSTag::ProperNoun => "NNP",
            POSTag::ProperNounPlural => "NNPS",
            POSTag::Verb => "VB",
            POSTag::VerbPast => "VBD",
            POSTag::VerbGerund => "VBG",
            POSTag::Verb3rdSing => "VBZ",
            POSTag::Adjective => "JJ",
            POSTag::Adverb => "RB",
            POSTag::Preposition => "IN",
            POSTag::Determiner => "DT",
            POSTag::Pronoun => "PRP",
            POSTag::Conjunction => "CC",
            POSTag::Punctuation => ".",
            POSTag::Number => "CD",
            POSTag::Unknown => "UNK",
        }
    }
}
80
/// Label carried by a [`Dependency`] arc.
///
/// `SyntaxAnalyzer::parse_dependencies` in this file only ever produces
/// `Subject`, `DirectObject`, `Modifier` and `Determiner`; the remaining
/// variants are part of the vocabulary but are not emitted by that method.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum DependencyRelation {
    /// The dependent is the subject of the head verb.
    Subject,
    /// The dependent is the direct object of the head verb.
    DirectObject,
    /// The dependent is an indirect object.
    IndirectObject,
    /// The dependent is an adjective or adverb modifying the head.
    Modifier,
    /// The dependent is a determiner attached to the head noun.
    Determiner,
    /// The dependent is a prepositional modifier.
    PrepositionalModifier,
    /// The dependent is joined to the head by a conjunction.
    Conjunction,
    /// The dependent is a complement of the head.
    Complement,
    /// Marks the root of the sentence.
    Root,
    /// The relation could not be determined.
    Unknown,
}
105
106/// A token with POS tag
107#[derive(Debug, Clone)]
108pub struct Token {
109    /// The text of the token
110    pub text: String,
111    /// Position in the original text
112    pub position: usize,
113    /// POS tag
114    pub pos: POSTag,
115    /// Lemma (base form)
116    pub lemma: String,
117}
118
119/// A dependency arc between tokens
120#[derive(Debug, Clone)]
121pub struct Dependency {
122    /// Index of the head token
123    pub head: usize,
124    /// Index of the dependent token
125    pub dependent: usize,
126    /// Type of dependency relation
127    pub relation: DependencyRelation,
128}
129
130/// A noun phrase
131#[derive(Debug, Clone)]
132pub struct NounPhrase {
133    /// Tokens in the noun phrase
134    pub tokens: Vec<Token>,
135    /// Head noun index within tokens
136    pub head_idx: usize,
137    /// Text span
138    pub text: String,
139}
140
/// Feature switches for [`SyntaxAnalyzer`].
///
/// The analyzer in this file stores the configuration but does not yet
/// consult any of the flags.
#[derive(Debug, Clone)]
pub struct SyntaxAnalyzerConfig {
    /// Whether part-of-speech tagging is enabled.
    pub enable_pos_tagging: bool,
    /// Whether dependency parsing is enabled.
    pub enable_dependency_parsing: bool,
    /// Whether phrase extraction is enabled.
    pub enable_phrase_extraction: bool,
}

impl Default for SyntaxAnalyzerConfig {
    /// Turns every feature on.
    fn default() -> Self {
        let enabled = true;
        SyntaxAnalyzerConfig {
            enable_pos_tagging: enabled,
            enable_dependency_parsing: enabled,
            enable_phrase_extraction: enabled,
        }
    }
}
161
162/// Rule-based syntax analyzer
163pub struct SyntaxAnalyzer {
164    #[allow(dead_code)] // Reserved for future conditional feature flags
165    config: SyntaxAnalyzerConfig,
166    // Lookup tables for POS tagging
167    common_nouns: HashMap<String, POSTag>,
168    common_verbs: HashMap<String, POSTag>,
169    common_adjectives: HashMap<String, POSTag>,
170    common_adverbs: HashMap<String, POSTag>,
171    prepositions: HashMap<String, POSTag>,
172    determiners: HashMap<String, POSTag>,
173    pronouns: HashMap<String, POSTag>,
174    conjunctions: HashMap<String, POSTag>,
175}
176
177impl SyntaxAnalyzer {
178    /// Create a new syntax analyzer
179    pub fn new(config: SyntaxAnalyzerConfig) -> Self {
180        Self {
181            config,
182            common_nouns: Self::build_noun_dict(),
183            common_verbs: Self::build_verb_dict(),
184            common_adjectives: Self::build_adjective_dict(),
185            common_adverbs: Self::build_adverb_dict(),
186            prepositions: Self::build_preposition_dict(),
187            determiners: Self::build_determiner_dict(),
188            pronouns: Self::build_pronoun_dict(),
189            conjunctions: Self::build_conjunction_dict(),
190        }
191    }
192
193    /// Tokenize text into words
194    fn tokenize(&self, text: &str) -> Vec<(String, usize)> {
195        let mut tokens = Vec::new();
196        let mut current_word = String::new();
197        let mut word_start = 0;
198
199        for (i, ch) in text.chars().enumerate() {
200            if ch.is_alphanumeric() || ch == '\'' || ch == '-' {
201                if current_word.is_empty() {
202                    word_start = i;
203                }
204                current_word.push(ch);
205            } else {
206                if !current_word.is_empty() {
207                    tokens.push((current_word.clone(), word_start));
208                    current_word.clear();
209                }
210                // Add punctuation as separate tokens
211                if !ch.is_whitespace() {
212                    tokens.push((ch.to_string(), i));
213                }
214            }
215        }
216
217        if !current_word.is_empty() {
218            tokens.push((current_word, word_start));
219        }
220
221        tokens
222    }
223
224    /// Perform POS tagging on text
225    pub fn pos_tag(&self, text: &str) -> Result<Vec<Token>> {
226        let raw_tokens = self.tokenize(text);
227        let mut tokens = Vec::new();
228
229        for (word, position) in raw_tokens {
230            let pos = self.tag_word(&word);
231            let lemma = self.lemmatize(&word, &pos);
232
233            tokens.push(Token {
234                text: word,
235                position,
236                pos,
237                lemma,
238            });
239        }
240
241        Ok(tokens)
242    }
243
244    /// Tag a single word with POS
245    fn tag_word(&self, word: &str) -> POSTag {
246        let lower = word.to_lowercase();
247
248        // Check punctuation
249        if word.chars().all(|c| c.is_ascii_punctuation()) {
250            return POSTag::Punctuation;
251        }
252
253        // Check numbers
254        if word.chars().all(|c| c.is_ascii_digit()) {
255            return POSTag::Number;
256        }
257
258        // Check dictionaries
259        if let Some(pos) = self.determiners.get(&lower) {
260            return pos.clone();
261        }
262        if let Some(pos) = self.pronouns.get(&lower) {
263            return pos.clone();
264        }
265        if let Some(pos) = self.prepositions.get(&lower) {
266            return pos.clone();
267        }
268        if let Some(pos) = self.conjunctions.get(&lower) {
269            return pos.clone();
270        }
271        if let Some(pos) = self.common_adverbs.get(&lower) {
272            return pos.clone();
273        }
274        if let Some(pos) = self.common_verbs.get(&lower) {
275            return pos.clone();
276        }
277        if let Some(pos) = self.common_adjectives.get(&lower) {
278            return pos.clone();
279        }
280        if let Some(pos) = self.common_nouns.get(&lower) {
281            return pos.clone();
282        }
283
284        // Pattern-based tagging
285        // Proper noun: capitalized and not at start of sentence
286        if word
287            .chars()
288            .next()
289            .expect("non-empty string")
290            .is_uppercase()
291        {
292            return POSTag::ProperNoun;
293        }
294
295        // Verb patterns
296        if lower.ends_with("ing") {
297            return POSTag::VerbGerund;
298        }
299        if lower.ends_with("ed") {
300            return POSTag::VerbPast;
301        }
302
303        // Noun patterns (plural)
304        if lower.ends_with('s') && !lower.ends_with("ss") {
305            return POSTag::NounPlural;
306        }
307
308        // Adjective patterns
309        if lower.ends_with("ive") || lower.ends_with("ous") || lower.ends_with("ful") {
310            return POSTag::Adjective;
311        }
312
313        // Adverb patterns
314        if lower.ends_with("ly") {
315            return POSTag::Adverb;
316        }
317
318        // Default to noun
319        POSTag::Noun
320    }
321
322    /// Simple lemmatization
323    fn lemmatize(&self, word: &str, pos: &POSTag) -> String {
324        let lower = word.to_lowercase();
325
326        match pos {
327            POSTag::NounPlural => {
328                // Remove plural 's'
329                if lower.ends_with("ies") {
330                    return format!("{}y", &lower[..lower.len() - 3]);
331                }
332                if lower.ends_with('s') && !lower.ends_with("ss") {
333                    return lower[..lower.len() - 1].to_string();
334                }
335                lower
336            },
337            POSTag::VerbPast | POSTag::Verb3rdSing => {
338                // Remove -ed, -s
339                if lower.ends_with("ed") {
340                    return lower[..lower.len() - 2].to_string();
341                }
342                if lower.ends_with('s') {
343                    return lower[..lower.len() - 1].to_string();
344                }
345                lower
346            },
347            POSTag::VerbGerund => {
348                // Remove -ing
349                if lower.ends_with("ing") {
350                    return lower[..lower.len() - 3].to_string();
351                }
352                lower
353            },
354            _ => lower,
355        }
356    }
357
358    /// Parse dependencies (simplified)
359    pub fn parse_dependencies(&self, tokens: &[Token]) -> Result<Vec<Dependency>> {
360        let mut dependencies = Vec::new();
361
362        if tokens.is_empty() {
363            return Ok(dependencies);
364        }
365
366        // Find the main verb (root)
367        let root_idx = tokens
368            .iter()
369            .position(|t| matches!(t.pos, POSTag::Verb | POSTag::VerbPast | POSTag::Verb3rdSing))
370            .unwrap_or(0);
371
372        // Find subject (noun/pronoun before verb)
373        #[allow(clippy::needless_range_loop)]
374        for i in 0..root_idx {
375            if matches!(
376                tokens[i].pos,
377                POSTag::Noun | POSTag::ProperNoun | POSTag::Pronoun
378            ) {
379                dependencies.push(Dependency {
380                    head: root_idx,
381                    dependent: i,
382                    relation: DependencyRelation::Subject,
383                });
384                break;
385            }
386        }
387
388        // Find object (noun after verb)
389        #[allow(clippy::needless_range_loop)]
390        for i in (root_idx + 1)..tokens.len() {
391            if matches!(tokens[i].pos, POSTag::Noun | POSTag::ProperNoun) {
392                dependencies.push(Dependency {
393                    head: root_idx,
394                    dependent: i,
395                    relation: DependencyRelation::DirectObject,
396                });
397                break;
398            }
399        }
400
401        // Find modifiers (adjectives before nouns, adverbs near verbs)
402        for i in 0..tokens.len() {
403            match tokens[i].pos {
404                POSTag::Adjective => {
405                    // Find next noun
406                    if let Some(noun_idx) = tokens[i + 1..]
407                        .iter()
408                        .position(|t| matches!(t.pos, POSTag::Noun | POSTag::ProperNoun))
409                    {
410                        dependencies.push(Dependency {
411                            head: i + 1 + noun_idx,
412                            dependent: i,
413                            relation: DependencyRelation::Modifier,
414                        });
415                    }
416                },
417                POSTag::Adverb => {
418                    // Modify nearest verb
419                    let verb_idx = tokens.iter().position(|t| {
420                        matches!(t.pos, POSTag::Verb | POSTag::VerbPast | POSTag::Verb3rdSing)
421                    });
422                    if let Some(v_idx) = verb_idx {
423                        dependencies.push(Dependency {
424                            head: v_idx,
425                            dependent: i,
426                            relation: DependencyRelation::Modifier,
427                        });
428                    }
429                },
430                POSTag::Determiner => {
431                    // Determine next noun
432                    if let Some(noun_idx) = tokens[i + 1..]
433                        .iter()
434                        .position(|t| matches!(t.pos, POSTag::Noun | POSTag::ProperNoun))
435                    {
436                        dependencies.push(Dependency {
437                            head: i + 1 + noun_idx,
438                            dependent: i,
439                            relation: DependencyRelation::Determiner,
440                        });
441                    }
442                },
443                _ => {},
444            }
445        }
446
447        Ok(dependencies)
448    }
449
450    /// Extract noun phrases
451    pub fn extract_noun_phrases(&self, tokens: &[Token]) -> Result<Vec<NounPhrase>> {
452        let mut phrases = Vec::new();
453        let mut current_phrase: Vec<Token> = Vec::new();
454        let mut head_idx = 0;
455
456        for token in tokens {
457            match token.pos {
458                POSTag::Determiner | POSTag::Adjective => {
459                    // Start or continue noun phrase
460                    current_phrase.push(token.clone());
461                },
462                POSTag::Noun
463                | POSTag::ProperNoun
464                | POSTag::NounPlural
465                | POSTag::ProperNounPlural => {
466                    // Add noun to phrase
467                    head_idx = current_phrase.len();
468                    current_phrase.push(token.clone());
469                },
470                _ => {
471                    // End of noun phrase
472                    if !current_phrase.is_empty() {
473                        let text = current_phrase
474                            .iter()
475                            .map(|t| t.text.as_str())
476                            .collect::<Vec<_>>()
477                            .join(" ");
478
479                        phrases.push(NounPhrase {
480                            tokens: current_phrase.clone(),
481                            head_idx,
482                            text,
483                        });
484
485                        current_phrase.clear();
486                        head_idx = 0;
487                    }
488                },
489            }
490        }
491
492        // Add final phrase if exists
493        if !current_phrase.is_empty() {
494            let text = current_phrase
495                .iter()
496                .map(|t| t.text.as_str())
497                .collect::<Vec<_>>()
498                .join(" ");
499
500            phrases.push(NounPhrase {
501                tokens: current_phrase,
502                head_idx,
503                text,
504            });
505        }
506
507        Ok(phrases)
508    }
509
510    /// Segment text into sentences
511    pub fn segment_sentences(&self, text: &str) -> Vec<String> {
512        let sentence_regex = Regex::new(r"[.!?]+\s+").expect("static regex literal");
513        sentence_regex
514            .split(text)
515            .map(|s| s.trim().to_string())
516            .filter(|s| !s.is_empty())
517            .collect()
518    }
519
520    // Dictionary builders
521    fn build_noun_dict() -> HashMap<String, POSTag> {
522        let nouns = vec![
523            "time",
524            "person",
525            "year",
526            "way",
527            "day",
528            "thing",
529            "man",
530            "world",
531            "life",
532            "hand",
533            "part",
534            "child",
535            "eye",
536            "woman",
537            "place",
538            "work",
539            "week",
540            "case",
541            "point",
542            "government",
543            "company",
544            "number",
545            "group",
546            "problem",
547            "fact",
548        ];
549        nouns
550            .into_iter()
551            .map(|s| (s.to_string(), POSTag::Noun))
552            .collect()
553    }
554
555    fn build_verb_dict() -> HashMap<String, POSTag> {
556        let verbs = vec![
557            "be", "have", "do", "say", "get", "make", "go", "know", "take", "see", "come", "think",
558            "look", "want", "give", "use", "find", "tell", "ask", "work", "seem", "feel", "try",
559            "leave", "call",
560        ];
561        verbs
562            .into_iter()
563            .map(|s| (s.to_string(), POSTag::Verb))
564            .collect()
565    }
566
567    fn build_adjective_dict() -> HashMap<String, POSTag> {
568        let adjectives = vec![
569            "good",
570            "new",
571            "first",
572            "last",
573            "long",
574            "great",
575            "little",
576            "own",
577            "other",
578            "old",
579            "right",
580            "big",
581            "high",
582            "different",
583            "small",
584            "large",
585            "next",
586            "early",
587            "young",
588            "important",
589            "few",
590            "public",
591            "bad",
592            "same",
593            "able",
594        ];
595        adjectives
596            .into_iter()
597            .map(|s| (s.to_string(), POSTag::Adjective))
598            .collect()
599    }
600
601    fn build_adverb_dict() -> HashMap<String, POSTag> {
602        let adverbs = vec![
603            "not", "so", "out", "up", "now", "only", "just", "more", "also", "very", "well",
604            "back", "there", "even", "still", "too", "here", "then", "always", "never", "often",
605            "quite", "really", "almost", "again",
606        ];
607        adverbs
608            .into_iter()
609            .map(|s| (s.to_string(), POSTag::Adverb))
610            .collect()
611    }
612
613    fn build_preposition_dict() -> HashMap<String, POSTag> {
614        let prepositions = vec![
615            "of", "in", "to", "for", "with", "on", "at", "from", "by", "about", "into", "through",
616            "during", "before", "after", "above", "below", "between", "under", "since", "without",
617            "within", "along", "among", "across",
618        ];
619        prepositions
620            .into_iter()
621            .map(|s| (s.to_string(), POSTag::Preposition))
622            .collect()
623    }
624
625    fn build_determiner_dict() -> HashMap<String, POSTag> {
626        let determiners = vec![
627            "the", "a", "an", "this", "that", "these", "those", "my", "your", "his", "her", "its",
628            "our", "their", "all", "both", "each", "every", "some", "any", "no", "another", "such",
629            "what", "which",
630        ];
631        determiners
632            .into_iter()
633            .map(|s| (s.to_string(), POSTag::Determiner))
634            .collect()
635    }
636
637    fn build_pronoun_dict() -> HashMap<String, POSTag> {
638        let pronouns = vec![
639            "i", "you", "he", "she", "it", "we", "they", "me", "him", "her", "us", "them", "who",
640            "whom", "what", "which", "this", "that",
641        ];
642        pronouns
643            .into_iter()
644            .map(|s| (s.to_string(), POSTag::Pronoun))
645            .collect()
646    }
647
648    fn build_conjunction_dict() -> HashMap<String, POSTag> {
649        let conjunctions = vec![
650            "and", "or", "but", "nor", "yet", "so", "for", "because", "although", "though",
651            "while", "if", "unless", "until", "when", "where",
652        ];
653        conjunctions
654            .into_iter()
655            .map(|s| (s.to_string(), POSTag::Conjunction))
656            .collect()
657    }
658}
659
#[cfg(test)]
mod tests {
    use super::*;

    /// Analyzer with the default configuration, shared by every test.
    fn default_analyzer() -> SyntaxAnalyzer {
        SyntaxAnalyzer::new(SyntaxAnalyzerConfig::default())
    }

    #[test]
    fn test_pos_tagging() {
        let tagged = default_analyzer()
            .pos_tag("The good brown fox jumps over the lazy dog.")
            .unwrap();

        assert!(!tagged.is_empty());

        // "The" and "good" are dictionary words.
        assert_eq!(tagged[0].pos, POSTag::Determiner);
        assert_eq!(tagged[1].pos, POSTag::Adjective);
        // "fox" falls through to the noun default.
        assert!(matches!(tagged[3].pos, POSTag::Noun | POSTag::ProperNoun));
        // "jumps" ends in 's' and may come out as a plural noun, so only its
        // presence is checked.
        assert!(tagged.iter().any(|t| t.text == "jumps"));
    }

    #[test]
    fn test_lemmatization() {
        let analyzer = default_analyzer();

        let cases = [
            ("running", POSTag::VerbGerund, "runn"),
            ("cats", POSTag::NounPlural, "cat"),
            ("jumped", POSTag::VerbPast, "jump"),
        ];
        for (word, tag, expected) in cases.iter() {
            assert_eq!(analyzer.lemmatize(word, tag), *expected);
        }
    }

    #[test]
    fn test_noun_phrase_extraction() {
        let analyzer = default_analyzer();

        let tagged = analyzer.pos_tag("The quick brown fox").unwrap();
        let phrases = analyzer.extract_noun_phrases(&tagged).unwrap();

        assert_eq!(phrases.len(), 1);
        assert_eq!(phrases[0].text, "The quick brown fox");
    }

    #[test]
    fn test_dependency_parsing() {
        let analyzer = default_analyzer();

        let tagged = analyzer.pos_tag("The cat chased the mouse").unwrap();
        let arcs = analyzer.parse_dependencies(&tagged).unwrap();

        // At least the subject and object arcs are expected.
        assert!(!arcs.is_empty());

        let has_subject = arcs
            .iter()
            .any(|arc| matches!(arc.relation, DependencyRelation::Subject));
        assert!(has_subject, "Should have subject dependency");
    }

    #[test]
    fn test_sentence_segmentation() {
        let input = "This is sentence one. This is sentence two! And sentence three?";

        let sentences = default_analyzer().segment_sentences(input);

        assert_eq!(sentences.len(), 3);
        assert!(sentences[0].contains("sentence one"));
        assert!(sentences[1].contains("sentence two"));
        assert!(sentences[2].contains("sentence three"));
    }

    #[test]
    fn test_tokenization() {
        let pieces = default_analyzer().tokenize("Hello, world!");

        // "Hello", ",", "world", "!"
        assert_eq!(pieces.len(), 4);
        assert_eq!(pieces[0].0, "Hello");
        assert_eq!(pieces[1].0, ",");
    }

    #[test]
    fn test_proper_noun_detection() {
        let tagged = default_analyzer()
            .pos_tag("John Smith lives in New York")
            .unwrap();

        // At least one capitalized word must come out as a proper noun.
        let proper_nouns: Vec<_> = tagged
            .iter()
            .filter(|t| matches!(t.pos, POSTag::ProperNoun))
            .collect();

        assert!(!proper_nouns.is_empty());
    }
}
755            .filter(|t| matches!(t.pos, POSTag::ProperNoun))
756            .collect();
757
758        assert!(!proper_nouns.is_empty());
759    }
760}