fast_html2md 0.0.61

A fast HTML-to-Markdown (html2md) crate for Rust
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
extern crate spectral;

#[cfg(feature = "scraper")]
use indoc::indoc;
#[cfg(feature = "scraper")]
use spectral::prelude::*;
#[cfg(feature = "scraper")]
use std::collections::HashMap;

use std::fs::File;
use std::io::prelude::*;
use url::Url;

#[test]
#[ignore]
#[cfg(feature = "scraper")]
fn test_marcfs() {
    // Smoke test: the MARC-FS readme sample must parse to non-empty markdown.
    let mut html = String::new();
    File::open("../test-samples/marcfs-readme.html")
        .unwrap()
        .read_to_string(&mut html)
        .expect("File must be readable");

    let markdown = html2md::parse_html(&html, false);
    assert!(!markdown.is_empty());
}

#[test]
#[cfg(feature = "scraper")]
fn test_real_world_wiki() -> Result<(), Box<dyn std::error::Error>> {
    use std::error::Error;
    use std::fs::{self, File};
    use std::io::{self, Read};
    use std::path::Path;

    let paths = fs::read_dir("../test-samples/wiki")?;

    // Parse one sample file and fail with a path-carrying error if the
    // converter produces an empty document.
    fn run_parse(path: &Path) -> Result<(), Box<dyn Error>> {
        let mut html = String::new();
        let mut html_file = File::open(path)?;
        html_file.read_to_string(&mut html)?;

        let result = html2md::parse_html(&html, false);

        if result.is_empty() {
            Err(Box::new(io::Error::new(
                io::ErrorKind::Other,
                format!("Result is empty for {}", path.display()),
            )))
        } else {
            Ok(())
        }
    }

    for entry in paths {
        let path = entry?.path();

        if path.is_file() {
            // Propagate the error instead of the old `Err(_e) => assert!(false)`,
            // which discarded both the error and the offending file's path.
            run_parse(&path)?;
        }
    }

    Ok(())
}

#[test]
fn test_real_world_wiki_rewriter() -> Result<(), Box<dyn std::error::Error>> {
    use std::error::Error;
    use std::fs::{self, File};
    use std::io::{self, Read};
    use std::path::Path;

    let paths = fs::read_dir("../test-samples/wiki")?;

    // Rewrite one sample file and fail with a path-carrying error if the
    // rewriter produces an empty document.
    fn run_parse(path: &Path) -> Result<(), Box<dyn Error>> {
        let mut html = String::new();
        let mut html_file = File::open(path)?;
        html_file.read_to_string(&mut html)?;

        let result = html2md::rewrite_html(&html, false);

        if result.is_empty() {
            Err(Box::new(io::Error::new(
                io::ErrorKind::Other,
                format!("Result is empty for {}", path.display()),
            )))
        } else {
            Ok(())
        }
    }

    for entry in paths {
        let path = entry?.path();

        if path.is_file() {
            // Propagate the error instead of the old `Err(_e) => assert!(false)`,
            // which discarded both the error and the offending file's path.
            run_parse(&path)?;
        }
    }

    Ok(())
}

#[tokio::test]
#[cfg(all(feature = "stream", feature = "rewriter"))]
async fn test_real_world_wiki_async() -> Result<(), Box<dyn std::error::Error>> {
    use std::error::Error;
    use std::fs::{self, File};
    use std::io::{self, Read};
    use std::path::Path;

    let paths = fs::read_dir("../test-samples/wiki")?;

    // Rewrite one sample file via the streaming API and fail with a
    // path-carrying error if the output is empty.
    async fn run_parse(path: &Path) -> Result<(), Box<dyn Error>> {
        let mut html = String::new();
        let mut html_file = File::open(path)?;
        html_file.read_to_string(&mut html)?;

        let result = html2md::rewrite_html_streaming(&html, false).await;

        if result.is_empty() {
            Err(Box::new(io::Error::new(
                io::ErrorKind::Other,
                format!("Result is empty for {}", path.display()),
            )))
        } else {
            Ok(())
        }
    }

    for entry in paths {
        let path = entry?.path();

        if path.is_file() {
            // Propagate the error instead of the old `Err(_e) => assert!(false)`,
            // which discarded both the error and the offending file's path.
            run_parse(&path).await?;
        }
    }

    Ok(())
}

#[test]
#[ignore]
#[cfg(feature = "scraper")]
fn test_real_world_ja() {
    // Smoke test: a real-world Japanese-language page must parse to
    // non-empty markdown.
    let mut html = String::new();
    File::open("../test-samples/real-world-ja-1.html")
        .unwrap()
        .read_to_string(&mut html)
        .expect("File must be readable");

    let markdown = html2md::parse_html(&html, false);
    assert!(!markdown.is_empty());
}

#[test]
#[ignore]
#[cfg(feature = "scraper")]
fn test_cheatsheet() {
    // The reference markdown fixture was read into `md` but never checked,
    // leaving the variable dead; assert it is non-empty so the fixture stays
    // wired into the test until a full comparison is added.
    let mut html = String::new();
    let mut md = String::new();
    let mut html_file = File::open("../test-samples/markdown-cheatsheet.html").unwrap();
    let mut md_file = File::open("../test-samples/markdown-cheatsheet.md").unwrap();
    html_file
        .read_to_string(&mut html)
        .expect("File must be readable");
    md_file
        .read_to_string(&mut md)
        .expect("File must be readable");
    let md_parsed = html2md::parse_html(&html, false);
    assert!(!md_parsed.is_empty());
    assert!(!md.is_empty());
}

/// newlines after list shouldn't be converted into text of the last list element
#[test]
#[cfg(feature = "scraper")]
fn test_list_newlines() {
    // Masked (x-ed out) real-world sample that reproduced the bug.
    let mut html = String::new();
    let mut html_file = File::open("../test-samples/dybr-bug-with-list-newlines.html").unwrap();
    html_file
        .read_to_string(&mut html)
        .expect("File must be readable");
    let result = html2md::parse_html(&html, false);
    // Full-document equality check: list items stay on their own `*` / numbered
    // lines and the trailing paragraphs land outside the list. The multi-line
    // literal below intentionally contains an embedded newline.
    assert_that(&result).is_equal_to("xx, xx xxxxx x xxxxxx xxxxxxxx xxxxx xxxxxxxxx xxxx xx xxxx xxxx xxxxxxxx.\nxxxx, xxx xx xxxxx xx xxxxxxxxxxx xxxx.\nxxxxxxxxxxx:\n* xxxxxxx x xxxxxxxxx (xxxxx)\n* xxxxxxx xx xxxxxx xxxxxxx, xxxxxxxxxx xxxxxxxxxx xxxx\n* xxxxxxxxx xx xxxxx, xx xxxxxx xx xxxxxxxxxxx\n* xxxxxxx xxxxxx xxxxxxxxx x xxxxxxxxxx, xxxxxxx xxxxxx x xxxxxxx, x xxxxxx.\n* xx xx, xxxxxx xx xxxxxxxx, xx-xxxx xxx x xxxxxxx xxx xxx, xxxxxxx xx xxxx. xxxxxxxxx xx x.\nxxxxx:\n1. xxxxxxxxx xxxxxxxxxx - xxxxx -\\_- !\n2. xxxxxx Mother of Learning - xxxx, xxxxxxx, xxxxxxxxxxxx\n3. xxxxxx xxxxxxx xxxxxxx, xxxxxxxx \"xxx xxxxx\". xxxxx xxxxx xxxx, xx x xxxxx xxxxxxx.\n4. xxxxxxxx! xxxx xxx xxxxxxxxx xxxx xxx, xx x xxxxxxxxx.\n5. xxxx xxxxxx - xxxxxx xxxxxxxx xxx x 15-17, xxxxxx xxxxxxxxxxxxx xx xxxxxxxx xxx xxxxxxx xxxxxx.\nxxx xxxx, xxxxx x xxxxxxxxx xx xxxxxxxxxx xxxxxx. xxxxxxxxx spelling puns, xxxxxxx, x xxxxxxxxx, xxxxxxxx xxx xxxxxxxx, xxxxxx xxxxxxxxxx xxxxxx.\nxxx xxxxxxx. xxx xxx xxxxxxxx xxxxxx - x x xxxxxxxxxxx xxxxx xxxx xxxxxxxxxx xxx xxxxx, x xxxxxx xxx xxxxxxxx xxxxxxxxxx xxx xxxxx. xx xxxxxx xxxxxxxx:\n* xxx xxxxx x xxx-xxxx xxxxxxxxx. xxxxxx xxx xxxx xxxxxxxx. x xx x xx xxxxxxxx, xx x xxxxxxx xxxxxx xxxxxx xx xxxxxxxxx. xxxxxxxxxx xxxx xxxxx xxxxxx xxxxxxxxx xxxxxxx xx xxxx.\n* xxxxxx xxxx Kotlin, x xxxxxxx. xxxxxxxxxx, xxxxxxxxxx xxx xxxxx xx xxx x xxxxxxxx\n* xxx xxxxx xxxxxxxxxx Rust, xxx xxx x xx xxx xxxx xxxxxxxxx xxxxxxxxxxxxxx xxxx xxx xxxxx, xxxxxxxx xxxxxxxxxxxxxx HTML x Markdown\n* xxx xxxx xxxxxx xxx xxxxxxxx xxxxxx. xx xxxx xxx - xxxxxxxxxxxxx xxxxxxxxxxx xxxxxx x xxxxxxxxx xxxxx x xxxxxxx.\n* xxxxxxxxx xxxx xxxxxxxx xxxxxxx xx FUSE 3.0. xxxxx xxxxxxx xxxxxxx xxx xxxxxxxxxxx.\n* x xxxxxxxx xxxx xxxxxxxx DevOps-xxxxxxx x xxxxx xxxxxxx. xxxxxxxxx, xxx xx xxxxx xxxxxx. 
x, xx, xxx xxx xxx xxxxxxxxx?\nxxxxx xx xxx:\n\\- xxxxxxxx xxxxxxxx\n\\- xxxxxxx xxxxxxxxx, xxxxxxx xxxxx xxxxx xxxxxxxx\n\\- xxxxxxxxxx xxxx Machine Learning, xxxx xxxxxx xxx xxxxxxxx OpenCL.".to_string());
}

#[test]
#[cfg(feature = "scraper")]
fn test_lists_from_text() {
    // Each expected escaped-dash line must appear in the converted output.
    let mut html = String::new();
    File::open("../test-samples/dybr-bug-with-lists-from-text.html")
        .unwrap()
        .read_to_string(&mut html)
        .expect("File must be readable");

    let result = html2md::parse_html(&html, false);

    for expected in [
        "\\- x xxxx xxxxx xx xxxxxxxxxx",
        "\\- x xxxx xxxxxxxx xxxxxxxxx xxxxxx xxx x xxxxxxxx xxxx",
        "\\- xxxx xxxxxxxx",
    ] {
        assert_that(&result).contains(expected);
    }
}

#[test]
#[cfg(feature = "scraper")]
fn test_strong_inside_link() {
    // Bold markup nested inside an anchor must end up inside the link text.
    let mut html = String::new();
    File::open("../test-samples/dybr-bug-with-strong-inside-link.html")
        .unwrap()
        .read_to_string(&mut html)
        .expect("File must be readable");

    let markdown = html2md::parse_html(&html, false);
    assert_that(&markdown).contains("[**Just God**](http://fanfics.me/ficXXXXXXX)");
}

#[test]
#[cfg(feature = "scraper")]
fn test_tables_with_newlines() {
    // Masked real-world sample: table cells whose content spans multiple lines.
    let mut html = String::new();
    let mut html_file = File::open("../test-samples/dybr-bug-with-tables-masked.html").unwrap();
    html_file
        .read_to_string(&mut html)
        .expect("File must be readable");
    let result = html2md::parse_html(&html, false);
    // Expected fragment: the embedded video link followed by the table whose
    // cells keep their internal newlines, then the trailing paragraphs.
    let m = indoc! { "[![Embedded YouTube video](https://img.youtube.com/vi/ZZZZZZZZZ/0.jpg)](https://www.youtube.com/watch?v=ZZZZZZZZZ)\n|Maybe I'm foolish, maybe I'm blind\nThinking I can see through this and see what's behind\nGot no way to prove it so maybe I'm blind\nBut I'm only human after all,\nI'm only human after all\nDon't put your blame on me|xxxxx xxxx, x xxxxxx, xxxxx xxxx — xxxxxx\nxxx xxxxx, xxx xxxx xxxxxx xxxxxx xxx, x xxxxxx xxx xxx xx xxx\nxxxx x xxxx xx xxxx xxxxxxx xxxxxxxxxxxxx, xxx xxx xxxxxxxx, x xxxxxx.\nxx x xxxxx xxxx xxxxxxx, x xxxxx-xx xxxxxx,\nx xxxxx xxxx xxxxxxx, x xxxxx xxxxxx.\nxx xxxx xxxx|\n|||\n[xxxxxx xxxxx xxxxx x xxxxxxx](/)\nx xxxx xxxxxxxxx xxxxxxx xxxxxxxxxxx xx xxxx xxxxx. x xxxxx xxxxxxx, xxxx xxxxx xxxxxxx xx xxxxxxxxxx xxxxxx. xxx xxxxxxxx, xxx xxxxxxxxx xxxxxxxxxxxxxx xx xxxxx — xxxxxxxxxx xxxxxxxxxx x xxxxx xxxxxxxxxxxxx xxxxxxxxx. x xxx xxxxxxxxxxxx *xxxx*, xxxxxx xxxx, xxxxxxxxxx xxxxx xxxxxxxx, xxxxxxxxxx x xxxxxxxxx. xx xxxxxx xxxxx xxxxxxxxxxxxxxxxx — x xxxxxx xxx xxxx.\nxxxxx xxxxxxxxxx xxxxx x xxxx xxxxxxxxxx xxxxx. xxxxx. x xxxxx: «x xxxxxx xxxxxxx, x xxxxx xxx xxxx, xx xxxxxxxx xxxxxx», — xxx xxxxx xxxxxxxx. xxxxxx xxx x xxxx xxxx xxxxxxxx xxxxxxxx xxxxxxx xxxx xxxxxxxxxxx xxxxxxxxxx, xxxxxxx xxxxxx xxxxxx xxx xxxxx, xxxxxxxxxxx x x xxxxxxx xxxxxxxxx.\nxx x xxxxx xxxx xxxxxxx. xxxxxx xxxxx? xxxxxxxxxxx x xxxxxxxxx xxxxxx.\nx xxxxx x xxxxxxxxxx x xxxxx... x xxxxxx xxxx xxxxxx xxxxxxx xxxxxxxx. xx xxxx, x xxxxxx xxx-xx xxxxxxxxx xx xxxxxxx, xxx xxxxxx xxxxxx, xxx xxx xxxxx, xxxxx xxxxxxxx xx xxxx... x xxxxxx xxxxxxx xx xxxx xxxxx, xxx, xxxxx xxxx xxxxxxxxxx, x xxxxx xxxxxxxxx xx xxxxx. x xxx-xx xxx xxxxx xxxxxxx xxxxxxxxxxxxx.\nxxxxxx xx... xx xxx xx xxxxxxxxxxxxx xxxxxx xxxxxxxxxxxxx x xxxxxxxxxx xxxxx, xxxxx xxx xxxx xxxxxxxxx, x xxxxx xxx xxxxxxxxx, xxx xxxxxxx xxx, xxx xxxx xxxxxxx xxxxxx, x xx xxx, xxx xxxx xxxxxxxx." };

    assert_that!(result).contains(m);
    // NOTE(review): the rewriter path is left disabled here — its table output
    // currently differs from the scraper path (see test_tables_crash2).
    // let result = html2md::rewrite_html(&html, false);
    // assert_that!(result).contains(m);
}

#[test]
#[ignore]
#[cfg(feature = "scraper")]
fn test_tables_crash2() {
    // Masked real-world sample with a vertical-header table that used to crash.
    let mut html = String::new();
    let mut html_file = File::open("../test-samples/dybr-bug-with-tables-2-masked.html").unwrap();
    html_file
        .read_to_string(&mut html)
        .expect("File must be readable");
    // Scraper path: header cells and value cells stay on the same row line.
    let table_with_vertical_header = html2md::parse_html(&html, false);
    let m = indoc! {"xxxxx xxxxxxxxxx xxxxxxx x xxxxx)) ~~xxxxxxxx xxxxxxxx~~\n## At a Glance\n \n|Current Conditions:|Open all year. No reservations. No services.|\n|||\n| Reservations: | No reservations. |\n| Fees | No fee. |\n| Water: | No water. |"};

    assert_that!(table_with_vertical_header).contains(m);

    // Rewriter path: currently emits each value cell on its own line, so the
    // expected fragment differs from the scraper output above.
    let table_with_vertical_header = html2md::rewrite_html(&html, false);

    // todo: fix spacing.
    let m = indoc! {"xxxxx xxxxxxxxxx xxxxxxx x xxxxx)) ~~xxxxxxxx xxxxxxxx~~\n## At a Glance\n|Current Conditions:|\nOpen all year. No reservations. No services.|\n|Reservations:|\nNo reservations.|\n|Fees|\nNo fee.|\n|Water:|\nNo water.|"};

    assert_that!(table_with_vertical_header).contains(m);
}

#[test]
#[cfg(feature = "scraper")]
fn test_html_from_text() {
    let mut html = String::new();
    File::open("../test-samples/real-world-1.html")
        .unwrap()
        .read_to_string(&mut html)
        .expect("File must be readable");

    // Register an ignoring handler so these tags contribute no output.
    let mut tag_factory: HashMap<String, Box<dyn html2md::scraper::TagHandlerFactory>> =
        HashMap::new();
    let ignore = Box::new(html2md::scraper::ignore::IgnoreTagFactory {});

    for name in ["script", "style", "noscript"] {
        tag_factory.insert(name.to_string(), ignore.clone());
    }
    tag_factory.insert("iframe".to_string(), ignore);

    let result = html2md::parse_html_custom_with_url(
        &html,
        &tag_factory,
        false,
        &Some(Url::parse("https://spider.cloud").unwrap()),
    );

    assert!(!result.is_empty());
}

#[test]
#[cfg(all(feature = "stream", feature = "rewriter"))]
fn test_html_from_text_rewrite() {
    // The buffer was needlessly boxed (`Box::new(String::new())`); a plain
    // String behaves identically here and avoids the extra heap indirection.
    let mut html = String::new();
    let mut html_file = File::open("../test-samples/real-world-1.html").unwrap();

    html_file
        .read_to_string(&mut html)
        .expect("File must be readable");

    let result = html2md::rewrite_html_custom_with_url(
        &html,
        &None,
        false,
        &Some(Url::parse("https://spider.cloud").unwrap()),
    );

    assert!(!result.is_empty());
}

// Golden markdown output for ../test-samples/spider-cloud.html, compared
// byte-for-byte by test_real_spider / test_real_spider_async. Kept as a raw
// string so the embedded escapes and entities survive verbatim.
const SPIDER_RESULT_MD: &str = r#"To help you get started with Spider, we’ll give you $200 in credits when you spend $100.[Terms apply](https://spider.cloud/promotion-spider-credits)
# The Web Crawler for AI Agents and LLMs
Spider offers the finest data collecting solution. Engineered for speed and scalability, it
allows you to elevate your AI projects.
[Get Started](https://spider.cloud/credits/new)View Preview
* Basic
* Streaming
Example request
Python
JSONL
Copy
```
`import requests, os, json
headers = {
&#x27;&#x27;Authorization &#x27;&#x27;: f &#x27;&#x27;Bearer {os.getenv(&quot;&quot;SPIDER\_API\_KEY &quot;&quot;)}&#x27;&#x27;,
&#x27;&#x27;Content-Type &#x27;&#x27;: &#x27;&#x27;application/jsonl &#x27;&#x27;,
}
json\_data = {&quot;&quot;limit &quot;&quot;:50,&quot;&quot;metadata &quot;&quot;:True,&quot;&quot;url &quot;&quot;:&quot;&quot;https://spider.cloud &quot;&quot;}
response = requests.post(&#x27;&#x27;https://api.spider.cloud/crawl &#x27;&#x27;, headers=headers, json=json\_data, stream=True)
with response as r:
r.raise\_for\_status()
for chunk in r.iter\_lines(
chunk\_size=None, decode\_unicode=True
):
data = json.loads(chunk)
print(data)`
```
[Free Trial](https://spider.cloud/credits/new?free-trial=1)
Example Response
## Built with the need for**Speed**
Experience the power of**Spider**, built fully in**Rust**for
next-generation scalability.
### 2.4secs
To crawl over 20,000 pages
### 500-1000x
Faster than alternatives
### 500x
Cheaper than traditional scraping services
Benchmarks displaying performance between Spider API request modes.
Spider API Request Modes &middot;Benchmarked tailwindcss.com &middot;06/16/2024
[See framework benchmarks](https://github.com/spider-rs/spider/blob/main/benches/BENCHMARKS.md)
### Seamless Integrations
Seamlessly integrate Spider with a wide range of platforms, ensuring data curation
perfectly aligned with your requirements. Compatible with all major AI tools.
[LangChain integration](https://python.langchain.com/docs/integrations/document_loaders/spider)[LlamaIndex integrationLlama Index Logo](https://docs.llamaindex.ai/en/stable/examples/data_connectors/WebPageDemo/#using-spider-reader)[CrewAI integrationCrewAI Logo](https://docs.crewai.com/tools/SpiderTool/)[FlowWiseAI integrationFlowiseAI LogoFlowiseAI](https://docs.flowiseai.com/integrations/langchain/document-loaders/spider-web-scraper-crawler)[Composio integrationComposio Logo](https://docs.composio.dev/introduction/foundations/components/list_local_tools#spider-crawler)[PhiData integrationPhiData Logo](https://docs.phidata.com/tools/spider)
### Concurrent Streaming
Save time and money without having to worry about bandwidth concerns by effectively
streaming all the results concurrently. The latency cost that is saved becomes drastic as
you crawl more websites.
### Warp Speed
Powered by the cutting-edge[Spider](https://github.com/spider-rs/spider)open-source project, our robust Rust engine scales effortlessly to handle extreme
workloads. We ensure continuous maintenance and improvement for top-tier performance.
## Kickstart Your Data Collecting Projects Today
Jumpstart web crawling with full elastic scaling concurrency, optimal formats, and AI scraping.
### Performance Tuned
Spider is written in Rust and runs in full concurrency to achieve crawling thousands of
pages in secs.
### Multiple response formats
Get clean and formatted markdown, HTML, or text content for fine-tuning or training AI
models.
### Caching
Further boost speed by caching repeated web page crawls to minimize expenses while
building.
### Smart Mode
Spider dynamically switches to Headless Chrome when it needs to quick.
Beta
### Scrape with AI
Do custom browser scripting and data extraction using the latest AI models with no cost
step caching.
### The crawler for LLMs
Don't let crawling and scraping be the highest latency in your LLM & AI agent stack.
### Scrape with no headaches
* Auto Proxy rotations
* Agent headers
* Anti-bot detections
* Headless chrome
* Markdown responses
### The Fastest Web Crawler
* Powered by[spider-rs](https://github.com/spider-rs/spider)
* 100,000 pages/seconds
* Unlimited concurrency
* Simple API
* 50,000 RPM
### Do more with AI
* Browser scripting
* Advanced extraction
* Data pipelines
* Ideal for LLMs and AI Agents
* Accurate labeling
## Achieve more with these new API features
Our API is set to stream so you can act in realtime.
![A user interface with a search bar containing the text &#34;Latest sports news,&#34; a green &#34;Submit&#34; button, and two icon buttons to display searching and extracting with the service.](/img/search_feature.webp)
### Search
Get access to search engine results from anywhere and easily crawl and transform pages to
LLM-ready markdown.
[Explore SearchRight Arrow](https://spider.cloud/docs/api#search)
![A user interface segment showing three icons representing different stages of data transformation.](/img/transform_feature_example.webp)
### Transform
Convert raw HTML into markdown easily by using this API. Transform thousands of html pages
in seconds.
[Explore TransformRight Arrow](https://spider.cloud/docs/api#transform)
## Join the community
Backed by a network of early advocates, contributors, and supporters.
[GitHub discussions
Chat Icon
](https://github.com/orgs/spider-rs/discussions)[Discord
Chat Icon
](https://discord.spider.cloud)
[
![iammerrick's avatar](/img/external/iammerrick_twitter.webp)
@iammerrick
Rust based crawler Spider is next level for crawling &amp;scraping sites. So fast.
Their cloud offering is also so easy to use. Good stuff. https://github.com/spider-rs/spider
](https://twitter.com/iammerrick/status/1787873425446572462)
[
![WilliamEspegren's avatar](/img/external/william_twitter.webp)
@WilliamEspegren
Web crawler built in rust, currently the nr1 performance in the world with crazy resource management Aaaaaaand they have a cloud offer, that’s wayyyy cheaper than any competitor
Name a reason for me to use anything else?
github.com/spider-rs/spid…
](https://twitter.com/WilliamEspegren/status/1789419820821184764)
[
![gasa's avatar](/img/external/gaza_twitter.webp)
@gasa
@gasathenaper
is the best crawling tool i have used. I had a complicated project where i needed to paste url and get the website whole website data. Spider does it in an instant
](https://x.com/gasathenaper/status/1810612492596383948)
[
![Ashpreet Bedi's avatar](/img/external/ashpreet_bedi.webp)
@Ashpreet Bedi
@ashpreetbedi
is THE best crawler out there, give it a try
](https://x.com/ashpreetbedi/status/1815512219003572315?s=46&t=37F5QP_8oKqOsNpHSo6VVw)
[
![Troyusrex's avatar](/img/external/troy_twitter.webp)
@Troyusrex
I found a new tool, Spider-rs, which scrapes significantly faster and handles more scenarios than the basic scraper I built did. Our use of Spider-rs and AWS infrastructure reduced the scraping time from four months to under a week.
](https://medium.com/@troyusrex/inside-my-virtual-college-advisor-a-deep-dive-into-rag-ai-and-agent-technology-84731b2928f7#1326)
[
![Dify.AI's avatar](/img/external/difyai.webp)
@Dify.AI
🕷️Spider @spider\_rust
can be used as a built-in tool in #Dify Workflow or as an LLM-callable tool in Agent. It allows fast and affordable web scraping and crawling when your AI applications need real-time web data for context.
](https://x.com/dify_ai/status/1818226971056243089)
## FAQ
Frequently asked questions about Spider.
### What is Spider?
Spider is a leading web crawling tool designed for speed and cost-effectiveness, supporting various data formats including LLM-ready markdown.
### Why is my website not crawling?
Your crawl may fail if it requires JavaScript rendering. Try setting your request to &#x27;chrome &#x27;to solve this issue.
### Can you crawl all pages?
Yes, Spider accurately crawls all necessary content without needing a sitemap.
### What formats can Spider convert web data into?
Spider outputs HTML, raw, text, and various markdown formats. It supports`JSON`,`JSONL`,`CSV`, and`XML`for API responses.
### Is Spider suitable for large scraping projects?
Absolutely, Spider is ideal for large-scale data collection and offers a cost-effective dashboard for data management.
### How can I try Spider?
Purchase credits for our cloud system or test the Open Source Spider engine to explore its capabilities.
### Does it respect robots.txt?
Yes, compliance with robots.txt is default, but you can disable this if necessary.
### Unable to get dynamic content?
If you are having trouble getting dynamic pages, try setting the request parameter to &quot;&quot;chrome &quot;&quot;or &quot;&quot;smart.&quot;&quot;You may also need to set `disable\_intercept` to allow third-party or external scripts to run.
### Why is my crawl going slow?
If you are experiencing a slow crawl, it is most likely due to the robots.txt file for the website. The robots.txt file may have a crawl delay set, and we respect the delay up to 60 seconds.
### Do you offer a Free Trial?
Yes, you can try out the service before being charged for free at[checkout](https://spider.cloud/credits/new?free-trial=1).
## Comprehensive Data Curation for Everyone
Trusted by leading tech businesses worldwide to deliver accurate and insightful data solutions.
Outer Labs
[Zapier LogoZapier](https://zapier.com/apps/spider/integrations)
Elementus Logo
Super AI Logo
LayerX Logo
Swiss Re
Write Sonic Logo
Alioth Logo
### Next generation data for AI, scale to millions
[Start now](https://spider.cloud/credits/new)
### Company
* [About](https://spider.cloud/about)
* [Privacy](https://spider.cloud/privacy)
* [Terms](https://spider.cloud/eula)
* [FAQ](https://spider.cloud/faq)
### Resources
* [API](https://spider.cloud/docs/api)
* [Docs](https://spider.cloud/docs/overview)
* [Guides](https://spider.cloud/guides)
* [Spider.rs Docs](https://docs.rs/spider/latest/spider/)
### Services
* [Pricing](https://spider.cloud/credits/new)
* [Web Crawling and Scraping](https://spider.cloud/web-crawling-and-scraping)
[All systems normal.](https://spidercloud.statuspage.io/)
[
Github LogoGitHub
](https://github.com/spider-rs/spider)[
Discord LogoDiscord
](https://discord.spider.cloud)[
Twitter LogoTwitter
](https://twitter.com/spider_rust)"#;

// Golden markdown output for ../test-samples/example.html, compared
// byte-for-byte by test_real_spider_async_basic.
const EXAMPLE_RESULT_MD: &str = r###"Example Domain
# Example Domain
This domain is for use in documentation examples without needing permission. Avoid use in operations.
[Learn more](https://iana.org/domains/example)"###;

#[test]
#[ignore]
fn test_real_spider() {
    // Golden test: rewrite the spider.cloud sample and compare against the
    // stored expected markdown.
    let mut html = String::new();
    let mut html_file: File = File::open("../test-samples/spider-cloud.html").unwrap();
    html_file
        .read_to_string(&mut html)
        .expect("File must be readable");
    let result = html2md::rewrite_html(&html, false);
    // assert_eq! prints both sides on failure; the previous
    // `assert!(result == …)` gave no diagnostic output at all.
    assert_eq!(result, SPIDER_RESULT_MD);
}

#[tokio::test]
#[ignore]
#[cfg(all(feature = "stream", feature = "rewriter"))]
async fn test_real_spider_async() {
    // Golden test for the streaming rewriter against the spider.cloud sample.
    let mut html = String::new();
    let mut html_file: File = File::open("../test-samples/spider-cloud.html").unwrap();
    html_file
        .read_to_string(&mut html)
        .expect("File must be readable");
    let result = html2md::rewrite_html_streaming(&html, false).await;
    // assert_eq! prints both sides on failure; the previous
    // `assert!(result == …)` gave no diagnostic output at all.
    assert_eq!(result, SPIDER_RESULT_MD);
}

#[tokio::test]
#[ignore]
#[cfg(all(feature = "stream", feature = "rewriter"))]
async fn test_real_spider_async_basic() {
    // Golden test for the streaming rewriter against the example.com sample.
    let mut html = String::new();
    let mut html_file: File = File::open("../test-samples/example.html").unwrap();
    html_file
        .read_to_string(&mut html)
        .expect("File must be readable");
    let result = html2md::rewrite_html_streaming(&html, false).await;
    // assert_eq! prints both sides on failure; the previous
    // `assert!(result == …)` gave no diagnostic output at all.
    assert_eq!(result, EXAMPLE_RESULT_MD);
}

#[tokio::test]
#[ignore]
#[cfg(all(feature = "stream", feature = "rewriter"))]
async fn test_real_spider_async_basic1() {
    // Smoke-run the streaming rewriter over the post-conflict sample and dump
    // the output for manual inspection; no assertion is made yet.
    let mut html = String::new();
    File::open("../test-samples/post-conflict.html")
        .unwrap()
        .read_to_string(&mut html)
        .expect("File must be readable");

    let result = html2md::rewrite_html_streaming(&html, false).await;

    println!("{:?}", result);
    // TODO: compare against a golden constant once the output stabilizes.
    // assert!(result == EXAMPLE_RESULT_MD);
}


// ===== True async stream tests =====

/// Stream conversion of small HTML chunks must match the sync rewriter output.
/// Stream conversion of small HTML chunks must match the sync rewriter output.
#[tokio::test]
#[cfg(all(feature = "stream", feature = "rewriter"))]
async fn test_stream_matches_sync() {
    let html = "<h1>Hello</h1><p>World</p><ul><li>one</li><li>two</li></ul>";
    let expected = html2md::rewrite_html(html, false);

    // Feed the document 10 bytes at a time to stress chunk-boundary handling.
    let stream = futures_util::stream::iter(
        html.as_bytes().chunks(10).map(Ok::<&[u8], std::io::Error>),
    );

    let streamed = html2md::rewrite_html_stream(stream, false).await.unwrap();
    assert_eq!(streamed, expected);
}

/// A single large chunk should produce the same output as sync.
#[tokio::test]
#[cfg(all(feature = "stream", feature = "rewriter"))]
async fn test_stream_single_chunk() {
    let html = "<h2>Title</h2><blockquote><p>quoted</p></blockquote><p>end</p>";
    let expected = html2md::rewrite_html(html, false);

    let chunks: Vec<Result<&[u8], std::io::Error>> = vec![Ok(html.as_bytes())];
    let stream = futures_util::stream::iter(chunks);

    let result = html2md::rewrite_html_stream(stream, false).await.unwrap();
    assert_eq!(result, expected);
}

/// An empty stream should return an empty/minimal string without panicking.
#[tokio::test]
#[cfg(all(feature = "stream", feature = "rewriter"))]
async fn test_stream_empty() {
    let chunks: Vec<Result<&[u8], std::io::Error>> = vec![];
    let stream = futures_util::stream::iter(chunks);

    let result = html2md::rewrite_html_stream(stream, false).await.unwrap();
    // `result.is_empty()` implies `result.trim().is_empty()`, so the original
    // `is_empty() || trim().is_empty()` disjunction collapses to the trim check.
    assert!(result.trim().is_empty());
}

/// Stream error propagation: a mid-stream error must surface as StreamConvertError::Stream.
#[tokio::test]
#[cfg(all(feature = "stream", feature = "rewriter"))]
async fn test_stream_error_propagation() {
    // A valid chunk, then an I/O failure, then a chunk that must never be consumed.
    let boom = std::io::Error::new(std::io::ErrorKind::BrokenPipe, "boom");
    let stream = futures_util::stream::iter(vec![
        Ok::<&[u8], std::io::Error>(b"<h1>Hi</h1>"),
        Err(boom),
        Ok(b"<p>never reached</p>"),
    ]);

    let err = html2md::rewrite_html_stream(stream, false)
        .await
        .unwrap_err();
    if let html2md::StreamConvertError::Stream(io_err) = err {
        assert_eq!(io_err.kind(), std::io::ErrorKind::BrokenPipe);
    } else {
        panic!("expected Stream variant, got: {err}");
    }
}

/// Stream against real-world wiki files should produce non-empty valid markdown.
/// Note: chunk boundaries cause lol_html to emit text callbacks at different points
/// than the sync single-write path, so output is semantically equivalent but not
/// byte-identical. We verify non-emptiness and that re-streaming with the same
/// chunk size is deterministic.
#[tokio::test]
#[cfg(all(feature = "stream", feature = "rewriter"))]
async fn test_stream_real_world_wiki() -> Result<(), Box<dyn std::error::Error>> {
    use std::fs;

    for entry in fs::read_dir("../test-samples/wiki")? {
        let path = entry?.path();
        if !path.is_file() {
            continue;
        }

        let html = fs::read_to_string(&path)?;

        // Convert the same document twice with identical 256-byte chunking.
        let mut runs = Vec::with_capacity(2);
        for _ in 0..2 {
            let stream = futures_util::stream::iter(
                html.as_bytes().chunks(256).map(Ok::<&[u8], std::io::Error>),
            );
            runs.push(html2md::rewrite_html_stream(stream, false).await.unwrap());
        }

        assert!(
            !runs[0].trim().is_empty(),
            "stream result empty for file: {}",
            path.display()
        );
        assert_eq!(
            runs[0],
            runs[1],
            "non-deterministic stream output for file: {}",
            path.display()
        );
    }

    Ok(())
}

/// Stream with custom URL should match the sync custom+url variant.
#[tokio::test]
#[cfg(all(feature = "stream", feature = "rewriter"))]
async fn test_stream_custom_with_url() {
    let html = r#"<a href="/path">link</a><p>text</p>"#;
    let url = Some(url::Url::parse("https://example.com").unwrap());

    let expected = html2md::rewrite_html_custom_with_url(html, &None, false, &url);

    // Tiny 8-byte chunks stress chunk-boundary handling in the streaming path.
    let stream = futures_util::stream::iter(
        html.as_bytes().chunks(8).map(Ok::<&[u8], std::io::Error>),
    );

    let streamed = html2md::rewrite_html_stream_custom_with_url(stream, &None, false, &url)
        .await
        .unwrap();
    assert_eq!(streamed, expected);
}

#[tokio::test]
#[ignore]
#[cfg(all(feature = "stream", feature = "rewriter"))]
async fn test_real_spider_async_basic2() {
    // Smoke-run the streaming rewriter over the example.com sample and dump
    // the output for manual inspection; no assertion is made yet.
    let mut html = String::new();
    File::open("../test-samples/example.html")
        .unwrap()
        .read_to_string(&mut html)
        .expect("File must be readable");

    let result = html2md::rewrite_html_streaming(&html, false).await;

    println!("{:?}", result);
    // TODO: compare against EXAMPLE_RESULT_MD once the output stabilizes.
    // assert!(result == EXAMPLE_RESULT_MD);
}