patent 0.1.0

A prior-art search for your code ideas — has this dev tool already been shipped?
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
//! Source registry: one implementor per ecosystem, fanned out concurrently.
//!
//! Sources are selected based on the query — a Rust query searches crates.io,
//! a Python query searches PyPI, etc. GitHub and Hacker News are always
//! included. When no language or domain signal is detected, the three largest
//! registries (npm, PyPI, crates.io) are added as a broad fallback.

use std::collections::HashSet;
use std::time::Duration;

use futures::future::join_all;

use crate::model::{Match, Query};
use crate::Result;

pub mod crates_io;
pub mod docker_hub;
pub mod github;
pub mod go;
pub mod hacker_news;
pub mod maven;
pub mod npm;
pub mod nuget;
pub mod pypi;
pub mod rubygems;
pub mod vscode;

/// One searchable ecosystem (a registry, a forge, a community index).
///
/// Implementors are handled as `Box<dyn SourceAdapter>` trait objects and are
/// queried concurrently by [`search_sources`], which is why the trait requires
/// `Send + Sync`.
#[async_trait::async_trait]
pub trait SourceAdapter: Send + Sync {
    /// Stable identifier, used in the transparency line.
    ///
    /// [`search_sources`] also uses it to report which sources were reached
    /// and which failed.
    fn id(&self) -> crate::model::Source;

    /// Search this source for prior art matching `query`.
    ///
    /// Returns the matches found in this source. An `Err` makes
    /// [`search_sources`] retry the source once; if the retry also fails the
    /// source is reported as failed and contributes no matches.
    async fn search(&self, query: &Query) -> Result<Vec<Match>>;
}

use crate::model::Source as S;

/// Build the HTTP client used by the source adapters.
///
/// The client allows 5 seconds to connect and 10 seconds for a whole request,
/// and identifies itself as `patent/<crate version> (prior-art search)`.
///
/// # Panics
///
/// Panics if the underlying `reqwest` client cannot be constructed.
fn http_client() -> reqwest::Client {
    const USER_AGENT: &str = concat!(
        "patent/",
        env!("CARGO_PKG_VERSION"),
        " (prior-art search)"
    );
    let connect_timeout = Duration::from_secs(5);
    let request_timeout = Duration::from_secs(10);

    let builder = reqwest::Client::builder()
        .user_agent(USER_AGENT)
        .connect_timeout(connect_timeout)
        .timeout(request_timeout);
    builder.build().expect("failed to build HTTP client")
}

/// Report whether `idea` mentions any of `terms` as a standalone word or phrase.
///
/// `idea` is lowercased before matching, so every entry of `terms` must
/// already be lowercase. An occurrence only counts when it is not glued to an
/// ASCII letter or digit, which keeps "go" from matching inside "django" or
/// "googol".
///
/// The boundary is enforced only on a side where the term itself begins or
/// ends with an ASCII letter or digit. A term with a punctuation edge carries
/// its own delimiter: ".net" must match in "asp.net", and "c#" in "c#12".
/// Requiring a non-alphanumeric neighbour there would make those terms
/// unmatchable in their most common spellings.
///
/// Empty terms never match. Returns `false` when `terms` is empty.
fn idea_contains(idea: &str, terms: &[&str]) -> bool {
    let lower = idea.to_lowercase();
    let bytes = lower.as_bytes();
    terms.iter().any(|term| {
        let term_bytes = term.as_bytes();
        let (first, last) = match (term_bytes.first(), term_bytes.last()) {
            (Some(first), Some(last)) => (*first, *last),
            // An empty term matches at every position; treat it as "no keyword".
            _ => return false,
        };
        let needs_left_boundary = first.is_ascii_alphanumeric();
        let needs_right_boundary = last.is_ascii_alphanumeric();

        // Check EVERY occurrence, not just the first: a short keyword like "go"
        // may first appear inside a larger word ("django") and then again as a
        // standalone word — only the standalone one should count.
        lower.match_indices(*term).any(|(pos, _)| {
            let end = pos + term_bytes.len();
            let left_ok = !needs_left_boundary
                || pos
                    .checked_sub(1)
                    .map_or(true, |i| !bytes[i].is_ascii_alphanumeric());
            let right_ok = !needs_right_boundary
                || bytes
                    .get(end)
                    .map_or(true, |b| !b.is_ascii_alphanumeric());
            left_ok && right_ok
        })
    })
}

/// Insert every entry of `sources` into `set`; entries already present are
/// simply kept.
fn add(set: &mut HashSet<S>, sources: &[S]) {
    for &source in sources {
        set.insert(source);
    }
}

fn detect_sources(idea: &str) -> HashSet<S> {
    let mut s = HashSet::new();

    // GitHub and Hacker News are always included: GitHub is the cross-language
    // home of source code, and Hacker News is the cross-language home of the
    // "Show HN" launch / discussion that often predates a registry release.
    // Both are language-agnostic, so they apply to every query.
    s.insert(S::GitHub);
    s.insert(S::HackerNews);

    // ── Explicit language / ecosystem mentions ──────────────────────────
    if idea_contains(idea, &["rust", "crate", "cargo"]) {
        s.insert(S::CratesIo);
    }
    if idea_contains(
        idea,
        &["npm", "node", "javascript", "typescript", "deno", "bun"],
    ) {
        s.insert(S::Npm);
    }
    if idea_contains(
        idea,
        &["python", "pip", "django", "flask", "pytorch", "pandas"],
    ) {
        s.insert(S::PyPI);
    }
    if idea_contains(idea, &["go", "golang", "goroutine"]) {
        s.insert(S::Go);
    }
    if idea_contains(
        idea,
        &["java", "kotlin", "spring", "maven", "gradle", "scala"],
    ) {
        s.insert(S::Maven);
    }
    if idea_contains(idea, &["ruby", "rails", "sinatra", "gem"]) {
        s.insert(S::RubyGems);
    }
    if idea_contains(
        idea,
        &["c#", ".net", "csharp", "dotnet", "nuget", "blazor", "unity"],
    ) {
        s.insert(S::NuGet);
    }

    // ── Domain inference (no language named, but the problem implies one) ─
    if idea_contains(
        idea,
        &[
            "ai",
            "llm",
            "machine learning",
            "deep learning",
            "neural",
            "model training",
            "inference",
            "embedding",
            "nlp",
            "computer vision",
            "data science",
            "data pipeline",
        ],
    ) {
        add(&mut s, &[S::PyPI, S::Npm]);
    }
    // CLI tools span every ecosystem, so a generic "cli" mention casts the net
    // across the dominant registries rather than just Rust/Go — otherwise the
    // flagship "kill the process on a port" demo would never search npm, where
    // fkill-cli / kill-port actually live.
    if idea_contains(idea, &["cli", "command line", "terminal tool", "shell"]) {
        add(&mut s, &[S::CratesIo, S::Go, S::Npm, S::PyPI]);
    }
    if idea_contains(
        idea,
        &[
            "frontend",
            "react",
            "vue",
            "angular",
            "svelte",
            "browser",
            "css",
            "ui component",
            "web component",
            "spa",
        ],
    ) {
        s.insert(S::Npm);
    }
    if idea_contains(
        idea,
        &[
            "api",
            "backend",
            "rest",
            "graphql",
            "microservice",
            "web server",
        ],
    ) {
        add(&mut s, &[S::Npm, S::PyPI, S::Go]);
    }
    if idea_contains(
        idea,
        &[
            "mobile",
            "ios",
            "android",
            "react native",
            "flutter",
            "swift",
            "swiftui",
        ],
    ) {
        add(&mut s, &[S::Npm, S::Maven]);
    }
    if idea_contains(
        idea,
        &[
            "game",
            "graphics",
            "rendering",
            "opengl",
            "vulkan",
            "bevy",
            "godot",
        ],
    ) {
        add(&mut s, &[S::CratesIo, S::NuGet]);
    }
    if idea_contains(idea, &["embedded", "firmware", "microcontroller", "rtos"]) {
        s.insert(S::CratesIo);
    }
    if idea_contains(
        idea,
        &[
            "docker",
            "container",
            "kubernetes",
            "k8s",
            "helm",
            "deploy",
            "infrastructure",
        ],
    ) {
        add(&mut s, &[S::DockerHub, S::Go]);
    }
    if idea_contains(idea, &["vscode", "extension", "plugin", "ide", "editor"]) {
        add(&mut s, &[S::VsCodeMarketplace, S::Npm]);
    }

    // ── Fallback: no signal at all → broad sweep ────────────────────────
    // The always-on GitHub + Hacker News aren't enough on their own; if no
    // language/domain branch matched, add the 3 biggest registries.
    const ALWAYS_ON: usize = 2; // GitHub + Hacker News
    if s.len() <= ALWAYS_ON {
        add(&mut s, &[S::Npm, S::PyPI, S::CratesIo]);
    }

    s
}

/// Construct the adapter for source `id`, handing it `client` for its HTTP
/// requests.
///
/// The match is deliberately exhaustive with no catch-all arm, so adding a
/// `Source` variant without an adapter is a compile error. Arms follow the
/// order of the `pub mod` declarations in this file.
fn build_source(id: S, client: reqwest::Client) -> Box<dyn SourceAdapter> {
    match id {
        S::CratesIo => Box::new(crates_io::CratesIo::new(client)),
        S::DockerHub => Box::new(docker_hub::DockerHub::new(client)),
        S::GitHub => Box::new(github::GitHub::new(client)),
        S::Go => Box::new(go::GoPkgDev::new(client)),
        S::HackerNews => Box::new(hacker_news::HackerNews::new(client)),
        S::Maven => Box::new(maven::Maven::new(client)),
        S::Npm => Box::new(npm::Npm::new(client)),
        S::NuGet => Box::new(nuget::NuGet::new(client)),
        S::PyPI => Box::new(pypi::PyPI::new(client)),
        S::RubyGems => Box::new(rubygems::RubyGems::new(client)),
        S::VsCodeMarketplace => Box::new(vscode::VsCodeMarketplace::new(client)),
    }
}

/// Build one adapter for every source that `detect_sources` selects for the
/// query's idea text.
///
/// A single HTTP client is created per call and each adapter receives a clone
/// of it. The adapters come out in the iteration order of a `HashSet`, so
/// callers must not rely on their order.
fn sources_for(query: &Query) -> Vec<Box<dyn SourceAdapter>> {
    let shared_client = http_client();
    let mut adapters: Vec<Box<dyn SourceAdapter>> = Vec::new();
    for source_id in detect_sources(&query.idea) {
        adapters.push(build_source(source_id, shared_client.clone()));
    }
    adapters
}

/// The outcome of a fan-out: deduped matches, the sources that responded, and
/// the selected sources that failed (so reduced coverage can be surfaced).
pub struct SearchOutcome {
    /// Matches from every reached source, combined and deduplicated by URL.
    pub matches: Vec<Match>,
    /// Sources whose search returned `Ok`, including those that found nothing.
    pub reached: Vec<crate::model::Source>,
    /// Sources that were attempted but still returned an error after the retry.
    pub failed: Vec<crate::model::Source>,
}

/// Search every source selected for `query` concurrently.
///
/// Sources that fail do not abort the search; they are reported in
/// [`SearchOutcome::failed`] instead.
pub async fn search_all(query: &Query) -> SearchOutcome {
    let selected = sources_for(query);
    search_sources(&selected, query).await
}

/// Query every adapter in `sources` concurrently and merge what comes back.
///
/// Each adapter gets at most two attempts: if its first search errors, it is
/// tried once more after an 800 ms pause and only the second result is kept.
/// Adapters that still fail are logged to stderr and listed in
/// [`SearchOutcome::failed`]; the others are listed in
/// [`SearchOutcome::reached`] and their matches are combined and passed through
/// [`dedup`]. Exposed for testing the fan-out in isolation.
pub async fn search_sources(sources: &[Box<dyn SourceAdapter>], query: &Query) -> SearchOutcome {
    let attempts = sources.iter().map(|source| {
        let id = source.id();
        async move {
            match source.search(query).await {
                ok @ Ok(_) => (id, ok),
                Err(_) => {
                    // One retry after a short pause; the first error is discarded.
                    tokio::time::sleep(Duration::from_millis(800)).await;
                    (id, source.search(query).await)
                }
            }
        }
    });

    let mut combined = Vec::new();
    let mut reached = Vec::new();
    let mut failed = Vec::new();
    for (id, result) in join_all(attempts).await {
        let found = match result {
            Ok(found) => found,
            Err(e) => {
                eprintln!("{id} not reached: {e}");
                failed.push(id);
                continue;
            }
        };
        reached.push(id);
        combined.extend(found);
    }

    SearchOutcome {
        matches: dedup(combined),
        reached,
        failed,
    }
}

/// Drop every match whose URL has already been seen earlier in the list.
///
/// The first match for each URL is kept and the relative order of the
/// survivors is unchanged. URL is a match's canonical identity across sources.
pub fn dedup(matches: Vec<Match>) -> Vec<Match> {
    let mut unique = matches;
    let mut seen_urls = HashSet::new();
    // `insert` returns false for a URL that is already in the set.
    unique.retain(|candidate| seen_urls.insert(candidate.url.clone()));
    unique
}

/// Unit tests for keyword matching (`idea_contains`) and source selection
/// (`detect_sources`). None of them perform network requests.
#[cfg(test)]
mod tests {
    use super::*;

    /// A keyword must not match when it is only a fragment of a longer word.
    #[test]
    fn idea_contains_respects_word_boundaries() {
        assert!(idea_contains("a fast async runtime", &["async"]));
        // Substrings inside larger words must not match.
        assert!(!idea_contains("rainbow trains", &["ai"]));
        assert!(!idea_contains("googol", &["go"]));
        assert!(!idea_contains("django framework", &["go"]));
    }

    /// A standalone keyword is found even when an embedded occurrence comes
    /// first in the string.
    #[test]
    fn idea_contains_checks_all_occurrences_not_just_the_first() {
        // A non-boundary substring earlier in the string must not mask a later
        // standalone occurrence of the keyword.
        assert!(idea_contains(
            "a tool for cargo packages written in go",
            &["go"]
        ));
        assert!(idea_contains("email summarizer that uses ai", &["ai"]));
        assert!(idea_contains("a good way to go fast", &["go"]));
    }

    /// GitHub and Hacker News are selected for language-specific ideas and for
    /// ideas with no recognizable signal alike.
    #[test]
    fn github_and_hacker_news_are_always_selected() {
        // Whatever the idea, the two language-agnostic indexes are present so
        // they are never falsely advertised but unreachable.
        for idea in ["a ruby gem for parsing csv", "asdf qwer zxcv", "rust crate"] {
            let s = detect_sources(idea);
            assert!(s.contains(&S::GitHub), "GitHub missing for {idea:?}");
            assert!(
                s.contains(&S::HackerNews),
                "Hacker News missing for {idea:?}"
            );
        }
    }

    /// Every source listed below must be selected by at least one of the
    /// sample ideas.
    #[test]
    fn every_built_source_is_reachable_from_some_idea() {
        // Guards against re-introducing a "marketed but never selected" source:
        // each variant build_source can construct must be selectable.
        let ideas = [
            "rust crate for embedded firmware",
            "a python pandas data pipeline",
            "a typescript react frontend component",
            "a golang microservice",
            "a java spring boot service",
            "a ruby on rails gem",
            "a c# dotnet unity game",
            "a docker container for kubernetes",
            "a vscode extension for editors",
            "anything at all with no signal",
        ];
        let mut seen: HashSet<S> = HashSet::new();
        for idea in ideas {
            seen.extend(detect_sources(idea));
        }
        // NOTE(review): this list is maintained by hand; a new `Source` variant
        // has to be added here as well for the guard to cover it.
        for variant in [
            S::CratesIo,
            S::GitHub,
            S::Npm,
            S::PyPI,
            S::HackerNews,
            S::Go,
            S::Maven,
            S::RubyGems,
            S::DockerHub,
            S::VsCodeMarketplace,
            S::NuGet,
        ] {
            assert!(
                seen.contains(&variant),
                "{variant} is built but never selected by detect_sources"
            );
        }
    }

    /// Naming a language or ecosystem selects the matching registry.
    #[test]
    fn language_mentions_select_their_registry() {
        assert!(detect_sources("a rust crate for parsing").contains(&S::CratesIo));
        assert!(detect_sources("a python library for parsing").contains(&S::PyPI));
        assert!(detect_sources("a docker image for caching").contains(&S::DockerHub));
        assert!(detect_sources("a ruby gem for parsing").contains(&S::RubyGems));
    }

    /// The short keywords "go" and "ai" match in ordinary sentences,
    /// regardless of capitalization.
    #[test]
    fn go_and_ai_match_natural_phrasings() {
        // Regression: trailing-space keywords ("go ", "ai ") used to be
        // impossible to match. These phrasings deliberately avoid the "cli"
        // branch (which would add Go on its own) so they isolate the keyword.
        assert!(detect_sources("a fast Go library for parsing json").contains(&S::Go));
        assert!(detect_sources("a library that uses AI to summarize text").contains(&S::PyPI));
        // And the keyword must still win when a non-boundary substring precedes
        // the standalone word (the first-occurrence regression).
        assert!(detect_sources("a cargo workspace tool also written in go").contains(&S::Go));
    }

    /// Both phrasings of the port-killer idea must select npm.
    #[test]
    fn port_killer_demo_searches_npm() {
        // The flagship README example must reach npm, where fkill-cli /
        // kill-port live — otherwise the headline demo finds no prior art.
        for idea in [
            "interactive cli to kill whatever's on a port",
            "CLI tool that kills whatever's on a port",
        ] {
            let s = detect_sources(idea);
            assert!(s.contains(&S::Npm), "npm missing for {idea:?}: {s:?}");
        }
    }

    /// An idea with no recognizable keyword gets the three fallback registries.
    #[test]
    fn no_signal_falls_back_to_broad_sweep() {
        let s = detect_sources("asdf qwer zxcv hjkl");
        assert!(s.contains(&S::Npm));
        assert!(s.contains(&S::PyPI));
        assert!(s.contains(&S::CratesIo));
    }
}