Skip to main content

hs_predict/llm/
prompt.rs

1//! Prompt builder for LLM-based HS code classification.
2//!
3//! [`PromptBuilder`] converts a [`ProductDescription`](crate::types::ProductDescription)
4//! into a ready-to-send [`LlmPrompt`](super::LlmPrompt) in English or Japanese.
5
6use crate::types::{Language, PhysicalForm, ProductDescription};
7use super::LlmPrompt;
8
9// ─────────────────────────────────────────────────────────────────────────────
10// PromptBuilder
11// ─────────────────────────────────────────────────────────────────────────────
12
13// ─────────────────────────────────────────────────────────────────────────────
14// Helpers
15// ─────────────────────────────────────────────────────────────────────────────
16
/// Maximum number of characters (Unicode scalar values, not bytes) of
/// user-supplied `additional_context` text that is injected into LLM prompts.
/// Content beyond this limit is silently truncated.
const MAX_CONTEXT_CHARS: usize = 500;

/// Sanitize free-text supplied by the user before it is injected into an LLM
/// prompt.
///
/// - Line breaks and tabs — including the Unicode line/paragraph separators
///   U+2028 and U+2029, which `char::is_control` does not report — are
///   replaced with a single space. The text therefore cannot start a new
///   prompt line, while the words on either side of the break stay separated.
/// - Every other control character (Unicode general category `Cc`, not just
///   the ASCII range) is dropped.
/// - The result is truncated to [`MAX_CONTEXT_CHARS`] characters, counted
///   after the replacements above.
fn sanitize_context(ctx: &str) -> String {
    ctx.chars()
        .filter_map(|c| match c {
            // Anything a renderer may treat as a line or column break.
            '\n' | '\r' | '\t' | '\u{0B}' | '\u{0C}' | '\u{85}' | '\u{2028}' | '\u{2029}' => {
                Some(' ')
            }
            _ if c.is_control() => None,
            _ => Some(c),
        })
        .take(MAX_CONTEXT_CHARS)
        .collect()
}
35
36/// Builds the system and user prompt texts from a [`ProductDescription`].
37///
38/// # Example
39/// ```rust
40/// # #[cfg(feature = "llm")]
41/// # {
42/// use hs_predict::llm::PromptBuilder;
43/// use hs_predict::types::{ProductDescription, SubstanceIdentifier, PhysicalForm, Language};
44///
45/// let product = ProductDescription {
46///     identifier: SubstanceIdentifier::from_cas("64-19-7"),
47///     physical_form: Some(PhysicalForm::Liquid),
48///     purity_pct: Some(99.8),
49///     purity_type: None,
50///     mixture_components: None,
51///     intended_use: None,
52///     additional_context: None,
53/// };
54///
55/// let prompt = PromptBuilder::new().build(&product);
56/// assert!(prompt.system_text.contains("HS 2022"));
57/// assert!(prompt.user_text.contains("64-19-7"));
58/// # }
59/// ```
60#[derive(Debug, Clone, Default)]
61pub struct PromptBuilder {
62    language: Language,
63}
64
65impl PromptBuilder {
66    /// Create a new builder that emits English prompts.
67    pub fn new() -> Self {
68        Self { language: Language::En }
69    }
70
71    /// Set the output language.
72    pub fn with_language(mut self, language: Language) -> Self {
73        self.language = language;
74        self
75    }
76
77    /// Build the [`LlmPrompt`] from the given product description.
78    pub fn build(&self, product: &ProductDescription) -> LlmPrompt {
79        let smiles_analysis = product
80            .identifier
81            .smiles
82            .as_deref()
83            .and_then(crate::smiles::classify_smiles);
84
85        let system_text = match self.language {
86            Language::En => self.system_text_en(),
87            Language::Ja => self.system_text_ja(),
88        };
89
90        let user_text = match self.language {
91            Language::En => self.user_text_en(product, smiles_analysis.as_ref()),
92            Language::Ja => self.user_text_ja(product, smiles_analysis.as_ref()),
93        };
94
95        LlmPrompt {
96            system_text,
97            user_text,
98            smiles_analysis,
99        }
100    }
101
102    // ─── System prompts ───────────────────────────────────────────────
103
104    fn system_text_en(&self) -> String {
105        r#"You are an expert customs classification specialist with deep knowledge of the
106Harmonized System (HS) 2022 nomenclature, particularly Chapters 28 and 29 for
107chemical products.
108
109Your task is to assign a six-digit HS 2022 code to the chemical product described
110in the user message.
111
112## Output format
113
114Respond with **only** a JSON object — no prose, no markdown:
115
116```json
117{
118  "hs_code":    "<6 ASCII digits, no dots>",
119  "confidence": <float 0.0–1.0>,
120  "rationale":  "<1–3 sentences explaining the classification>",
121  "alternatives": [
122    { "hs_code": "<6 digits>", "confidence": <float>, "reason": "<brief>" }
123  ]
124}
125```
126
127`alternatives` may be an empty array `[]`.
128
129## Confidence guide
130
131| Score | Meaning |
132|-------|---------|
133| ≥ 0.90 | Certain of the 6-digit sub-heading |
134| ≥ 0.70 | Certain of the 4-digit heading, sub-heading uncertain |
135| ≥ 0.50 | Chapter correct, heading uncertain |
136| < 0.50 | Significant uncertainty — classify to the most likely heading |
137
138## Rules
139
140- Use HS 2022 edition.
141- If a SMILES-derived heading hint is provided, treat it as a cross-check, not
142  authoritative — rule 1 of HS Explanatory Notes takes precedence over chemical
143  structure alone.
144- Always verify Chapter Notes and Section Notes before finalising.
145- For mixtures, classify by the component that gives the mixture its essential
146  character (GRI 3b) unless a specific mixture heading applies.
147"#.to_string()
148    }
149
150    fn system_text_ja(&self) -> String {
151        r#"あなたは輸出入通関の専門家であり、HS 2022 品目表(特に第28類・第29類の化学品)に
152精通しています。
153
154ユーザーメッセージに記載された化学品に対して、6桁の HS 2022 コードを付与してください。
155
156## 出力形式
157
158**JSON オブジェクトのみ**を返答してください(文章・マークダウン不要):
159
160```json
161{
162  "hs_code":    "<6桁の数字、ドットなし>",
163  "confidence": <0.0〜1.0 の小数>,
164  "rationale":  "<分類根拠を1〜3文で>",
165  "alternatives": [
166    { "hs_code": "<6桁>", "confidence": <小数>, "reason": "<簡潔な理由>" }
167  ]
168}
169```
170
171`alternatives` は空配列 `[]` でも可。
172
173## 信頼度の目安
174
175| スコア | 意味 |
176|--------|------|
177| ≥ 0.90 | 6桁の細分まで確実 |
178| ≥ 0.70 | 4桁の号まで確実、細分は不確実 |
179| ≥ 0.50 | 類は正しいが号が不確実 |
180| < 0.50 | 大きな不確実性あり — 最も可能性の高い号に分類 |
181
182## ルール
183
184- HS 2022年版を使用すること。
185- SMILES由来のヘッディングヒントが提供された場合は参考情報として扱い、
186  HS解説書の通則1を優先すること。
187- 分類確定前に類注および部注を確認すること。
188- 混合物の場合、特定の混合物号がない限り、本質的特性を与える成分で分類(通則3(b))。
189"#.to_string()
190    }
191
192    // ─── User prompts ─────────────────────────────────────────────────
193
194    fn user_text_en(
195        &self,
196        product: &ProductDescription,
197        smiles_analysis: Option<&crate::smiles::SmilesClassification>,
198    ) -> String {
199        let mut parts: Vec<String> = Vec::new();
200
201        parts.push("## Product to classify".to_string());
202        parts.push(String::new());
203
204        // Identifiers
205        let id = &product.identifier;
206        if let Some(ref cas) = id.cas {
207            parts.push(format!("- **CAS**: {}", cas));
208        }
209        if let Some(ref iupac) = id.iupac_name {
210            parts.push(format!("- **IUPAC name**: {}", iupac));
211        }
212        if let Some(ref smiles) = id.smiles {
213            parts.push(format!("- **SMILES**: {}", smiles));
214        }
215        if let Some(ref inchi) = id.inchi {
216            parts.push(format!("- **InChI**: {}", inchi));
217        }
218        if let Some(ref inchikey) = id.inchi_key {
219            parts.push(format!("- **InChIKey**: {}", inchikey));
220        }
221
222        // Physical form
223        if let Some(ref form) = product.physical_form {
224            parts.push(format!("- **Physical form**: {}", physical_form_en(form)));
225        }
226
227        // Purity
228        if let Some(purity) = product.purity_pct {
229            parts.push(format!("- **Purity**: {:.1}%", purity));
230        }
231
232        // Intended use
233        if let Some(ref use_) = product.intended_use {
234            parts.push(format!("- **Intended use**: {:?}", use_));
235        }
236
237        // Mixture components
238        if let Some(ref comps) = product.mixture_components {
239            parts.push("- **Mixture components**:".to_string());
240            for c in comps {
241                let frac = c
242                    .weight_fraction_pct
243                    .map(|f| format!(" ({:.1}% w/w)", f))
244                    .unwrap_or_default();
245                let name = c.substance.cas.as_deref()
246                    .or(c.substance.iupac_name.as_deref())
247                    .unwrap_or("unknown");
248                parts.push(format!("  - {}{}", name, frac));
249            }
250        }
251
252        // Additional context — sanitized to prevent prompt-injection via
253        // control characters or excessively long free-text input.
254        if let Some(ref ctx) = product.additional_context {
255            parts.push(format!("- **Additional context**: {}", sanitize_context(ctx)));
256        }
257
258        // SMILES analysis hint
259        if let Some(analysis) = smiles_analysis {
260            parts.push(String::new());
261            parts.push("## SMILES pre-analysis hint".to_string());
262            parts.push(String::new());
263            parts.push(format!("- **Organic class**: {:?}", analysis.organic_class));
264            if !analysis.functional_groups.is_empty() {
265                let groups: Vec<&str> = analysis
266                    .functional_groups
267                    .iter()
268                    .map(|g| g.label())
269                    .collect();
270                parts.push(format!("- **Functional groups detected**: {}", groups.join(", ")));
271            }
272            let hint = &analysis.heading_hint;
273            if let Some(heading) = hint.heading {
274                parts.push(format!(
275                    "- **Heading hint**: {}.{:02} ({}, confidence {:.2})",
276                    heading / 100,
277                    heading % 100,
278                    hint.rationale,
279                    hint.confidence
280                ));
281            } else {
282                parts.push(format!(
283                    "- **Chapter hint**: Ch.{:02} (confidence {:.2})",
284                    hint.chapter, hint.confidence
285                ));
286            }
287            parts.push(String::new());
288            parts.push(
289                "_This hint is derived from SMILES pattern matching and is provided for \
290                 cross-checking only. Apply the HS Explanatory Notes authoritatively._"
291                    .to_string(),
292            );
293        }
294
295        parts.join("\n")
296    }
297
298    fn user_text_ja(
299        &self,
300        product: &ProductDescription,
301        smiles_analysis: Option<&crate::smiles::SmilesClassification>,
302    ) -> String {
303        let mut parts: Vec<String> = Vec::new();
304
305        parts.push("## 分類対象品目".to_string());
306        parts.push(String::new());
307
308        let id = &product.identifier;
309        if let Some(ref cas) = id.cas {
310            parts.push(format!("- **CAS番号**: {}", cas));
311        }
312        if let Some(ref iupac) = id.iupac_name {
313            parts.push(format!("- **IUPAC名**: {}", iupac));
314        }
315        if let Some(ref smiles) = id.smiles {
316            parts.push(format!("- **SMILES**: {}", smiles));
317        }
318        if let Some(ref inchi) = id.inchi {
319            parts.push(format!("- **InChI**: {}", inchi));
320        }
321        if let Some(ref inchikey) = id.inchi_key {
322            parts.push(format!("- **InChIKey**: {}", inchikey));
323        }
324
325        if let Some(ref form) = product.physical_form {
326            parts.push(format!("- **物理的形状**: {}", physical_form_ja(form)));
327        }
328
329        if let Some(purity) = product.purity_pct {
330            parts.push(format!("- **純度**: {:.1}%", purity));
331        }
332
333        if let Some(ref use_) = product.intended_use {
334            parts.push(format!("- **用途**: {:?}", use_));
335        }
336
337        if let Some(ref comps) = product.mixture_components {
338            parts.push("- **混合成分**:".to_string());
339            for c in comps {
340                let frac = c
341                    .weight_fraction_pct
342                    .map(|f| format!(" ({:.1}% w/w)", f))
343                    .unwrap_or_default();
344                let name = c.substance.cas.as_deref()
345                    .or(c.substance.iupac_name.as_deref())
346                    .unwrap_or("不明");
347                parts.push(format!("  - {}{}", name, frac));
348            }
349        }
350
351        // `additional_context` — sanitized to prevent prompt-injection.
352        if let Some(ref ctx) = product.additional_context {
353            parts.push(format!("- **補足情報**: {}", sanitize_context(ctx)));
354        }
355
356        if let Some(analysis) = smiles_analysis {
357            parts.push(String::new());
358            parts.push("## SMILES 事前解析ヒント".to_string());
359            parts.push(String::new());
360            parts.push(format!("- **有機/無機区分**: {:?}", analysis.organic_class));
361            if !analysis.functional_groups.is_empty() {
362                let groups: Vec<&str> = analysis
363                    .functional_groups
364                    .iter()
365                    .map(|g| g.label())
366                    .collect();
367                parts.push(format!("- **検出官能基**: {}", groups.join("、")));
368            }
369            let hint = &analysis.heading_hint;
370            if let Some(heading) = hint.heading {
371                parts.push(format!(
372                    "- **号ヒント**: {}.{:02}({}、信頼度 {:.2})",
373                    heading / 100,
374                    heading % 100,
375                    hint.rationale,
376                    hint.confidence
377                ));
378            } else {
379                parts.push(format!(
380                    "- **類ヒント**: 第{:02}類(信頼度 {:.2})",
381                    hint.chapter, hint.confidence
382                ));
383            }
384            parts.push(String::new());
385            parts.push(
386                "_このヒントはSMILESパターンマッチングによるもので、参考情報です。\
387                 HS解説書を正式な根拠として適用してください。_"
388                    .to_string(),
389            );
390        }
391
392        parts.join("\n")
393    }
394}
395
396// ─────────────────────────────────────────────────────────────────────────────
397// Helpers
398// ─────────────────────────────────────────────────────────────────────────────
399
400fn physical_form_en(form: &PhysicalForm) -> &'static str {
401    match form {
402        PhysicalForm::Solid => "Solid",
403        PhysicalForm::Powder { .. } => "Powder",
404        PhysicalForm::Granules => "Granules",
405        PhysicalForm::Liquid => "Liquid",
406        PhysicalForm::Solution { .. } => "Solution",
407        PhysicalForm::Gas => "Gas",
408        PhysicalForm::Foil { .. } => "Foil",
409        PhysicalForm::Ingot => "Ingot",
410        PhysicalForm::Unknown => "Unknown",
411    }
412}
413
414fn physical_form_ja(form: &PhysicalForm) -> &'static str {
415    match form {
416        PhysicalForm::Solid => "固体",
417        PhysicalForm::Powder { .. } => "粉末",
418        PhysicalForm::Granules => "顆粒",
419        PhysicalForm::Liquid => "液体",
420        PhysicalForm::Solution { .. } => "溶液",
421        PhysicalForm::Gas => "気体",
422        PhysicalForm::Foil { .. } => "箔",
423        PhysicalForm::Ingot => "インゴット",
424        PhysicalForm::Unknown => "不明",
425    }
426}
427
428// ─────────────────────────────────────────────────────────────────────────────
429// Tests
430// ─────────────────────────────────────────────────────────────────────────────
431
#[cfg(test)]
mod tests {
    use super::*;
    use crate::types::{ProductDescription, SubstanceIdentifier};

    /// Fixture: liquid acetic acid with CAS number, IUPAC name and SMILES set.
    fn acetic_acid() -> ProductDescription {
        ProductDescription {
            identifier: SubstanceIdentifier {
                cas: Some("64-19-7".to_string()),
                iupac_name: Some("acetic acid".to_string()),
                smiles: Some("CC(O)=O".to_string()),
                inchi: None,
                inchi_key: None,
                cid: None,
            },
            physical_form: Some(PhysicalForm::Liquid),
            purity_pct: Some(99.5),
            purity_type: None,
            mixture_components: None,
            intended_use: None,
            additional_context: None,
        }
    }

    /// Fixture: [`acetic_acid`] carrying the given free-text context.
    fn acetic_acid_with_context(ctx: &str) -> ProductDescription {
        ProductDescription {
            additional_context: Some(ctx.into()),
            ..acetic_acid()
        }
    }

    #[test]
    fn en_system_prompt_contains_hs_2022() {
        let p = PromptBuilder::new().build(&acetic_acid());
        assert!(p.system_text.contains("HS 2022"));
    }

    #[test]
    fn en_user_text_contains_cas() {
        let p = PromptBuilder::new().build(&acetic_acid());
        assert!(p.user_text.contains("64-19-7"));
    }

    #[test]
    fn en_user_text_contains_purity() {
        let p = PromptBuilder::new().build(&acetic_acid());
        assert!(p.user_text.contains("99.5"));
    }

    #[test]
    fn en_user_text_contains_smiles_hint() {
        let p = PromptBuilder::new().build(&acetic_acid());
        // The section header is emitted whenever a SMILES analysis exists, so
        // it is the one marker that does not depend on heading vs. chapter.
        assert!(p.user_text.contains("## SMILES pre-analysis hint"));
    }

    #[test]
    fn smiles_analysis_populated_when_smiles_present() {
        let p = PromptBuilder::new().build(&acetic_acid());
        assert!(p.smiles_analysis.is_some());
    }

    #[test]
    fn smiles_analysis_none_when_no_smiles() {
        let product = ProductDescription {
            identifier: SubstanceIdentifier::from_cas("64-19-7"),
            physical_form: None,
            purity_pct: None,
            purity_type: None,
            mixture_components: None,
            intended_use: None,
            additional_context: None,
        };
        let p = PromptBuilder::new().build(&product);
        assert!(p.smiles_analysis.is_none());
    }

    #[test]
    fn ja_system_prompt_contains_hs_2022_ja() {
        let p = PromptBuilder::new()
            .with_language(Language::Ja)
            .build(&acetic_acid());
        assert!(p.system_text.contains("HS 2022"));
    }

    #[test]
    fn ja_user_text_contains_cas() {
        let p = PromptBuilder::new()
            .with_language(Language::Ja)
            .build(&acetic_acid());
        assert!(p.user_text.contains("64-19-7"));
    }

    #[test]
    fn ja_user_text_uses_japanese_labels() {
        let p = PromptBuilder::new()
            .with_language(Language::Ja)
            .build(&acetic_acid());
        assert!(p.user_text.contains("CAS番号"));
        assert!(p.user_text.contains("液体"));
    }

    #[test]
    fn mixture_components_listed() {
        use crate::types::MixtureComponent;
        let product = ProductDescription {
            identifier: SubstanceIdentifier::from_cas("7732-18-5"),
            physical_form: Some(PhysicalForm::Solution {
                concentration_pct_ww: Some(30.0),
                solvent: None,
            }),
            purity_pct: None,
            purity_type: None,
            mixture_components: Some(vec![
                MixtureComponent {
                    substance: SubstanceIdentifier::from_cas("1310-73-2"),
                    weight_fraction_pct: Some(30.0),
                    volume_fraction_pct: None,
                    is_solvent: false,
                },
            ]),
            intended_use: None,
            additional_context: None,
        };
        let p = PromptBuilder::new().build(&product);
        assert!(p.user_text.contains("1310-73-2"));
        assert!(p.user_text.contains("30.0"));
    }

    #[test]
    fn sanitize_context_removes_line_breaks_and_controls() {
        let cleaned = sanitize_context("a\nb\r\nc\u{0}d");
        assert!(!cleaned.chars().any(|c| c.is_control()));
        assert!(cleaned.contains('a') && cleaned.contains('d'));
    }

    #[test]
    fn additional_context_cannot_start_a_new_prompt_line() {
        let product = acetic_acid_with_context("note\n## Injected section\r\ntrailer");
        for language in [Language::En, Language::Ja] {
            let p = PromptBuilder::new().with_language(language).build(&product);
            assert!(!p.user_text.contains("\n## Injected"));
            // The whole context must still sit on a single bullet line.
            let line = p
                .user_text
                .lines()
                .find(|l| l.contains("Injected section"))
                .expect("context text is kept");
            assert!(line.starts_with("- **"));
            assert!(line.contains("trailer"));
        }
    }

    #[test]
    fn additional_context_is_truncated() {
        let product = acetic_acid_with_context(&"x".repeat(MAX_CONTEXT_CHARS + 100));
        let p = PromptBuilder::new().build(&product);
        assert!(p.user_text.contains(&"x".repeat(MAX_CONTEXT_CHARS)));
        assert!(!p.user_text.contains(&"x".repeat(MAX_CONTEXT_CHARS + 1)));
    }
}