r2rs_mass/data/mod.rs
1use std::io::Cursor;
2
3use polars::prelude::*;
4
5/// # Determinations of Nickel Content
6///
7/// ## Description:
8///
9/// A numeric vector of 31 determinations of nickel content (ppm) in a
10/// Canadian syenite rock.
11///
12/// ## Usage:
13///
14/// abbey
15///
16/// ## Source:
17///
18/// S. Abbey (1988) _Geostandards Newsletter_ *12*, 241.
19///
20/// ## References:
21///
22/// Venables, W. N. and Ripley, B. D. (2002) _Modern Applied
23/// Statistics with S._ Fourth edition. Springer.
24pub fn abbey() -> PolarsResult<DataFrame> {
25 CsvReader::new(Cursor::new(include_str!("abbey.csv"))).finish()
26}
27
28/// # Accidental Deaths in the US 1973-1978
29///
30/// ## Description:
31///
32/// A regular time series giving the monthly totals of accidental
33/// deaths in the USA.
34///
35/// ## Usage:
36///
37/// accdeaths
38///
39/// ## Details:
40///
/// The values for the first six months of 1979 (p. 326) were ‘7798 7406
42/// 8363 8460 9217 9316’.
43///
44/// ## Source:
45///
46/// P. J. Brockwell and R. A. Davis (1991) _Time Series: Theory and
47/// Methods._ Springer, New York.
48///
49/// ## References:
50///
51/// Venables, W. N. and Ripley, B. D. (2002) _Modern Applied
52/// Statistics with S-PLUS._ Fourth Edition. Springer.
53pub fn accdeaths() -> PolarsResult<DataFrame> {
54 CsvReader::new(Cursor::new(include_str!("accdeaths.csv"))).finish()
55}
56
57/// # Australian AIDS Survival Data
58///
59/// ## Description:
60///
61/// Data on patients diagnosed with AIDS in Australia before 1 July
62/// 1991.
63///
64/// ## Usage:
65///
66/// Aids2
67///
68/// ## Format:
69///
70/// This data frame contains 2843 rows and the following columns:
71///
/// * ‘state’ Grouped state of origin: ‘"NSW "’ includes ACT and
73/// ‘"other"’ is WA, SA, NT and TAS.
74/// * ‘sex’ Sex of patient.
75/// * ‘diag’ (Julian) date of diagnosis.
76/// * ‘death’ (Julian) date of death or end of observation.
77/// * ‘status’ ‘"A"’ (alive) or ‘"D"’ (dead) at end of observation.
78/// * ‘T.categ’ Reported transmission category.
79/// * ‘age’ Age (years) at diagnosis.
80///
81/// ## Note:
82///
83/// This data set has been slightly jittered as a condition of its
84/// release, to ensure patient confidentiality.
85///
86/// ## Source:
87///
88/// Dr P. J. Solomon and the Australian National Centre in HIV
89/// Epidemiology and Clinical Research.
90///
91/// ## References:
92///
93/// Venables, W. N. and Ripley, B. D. (2002) _Modern Applied
94/// Statistics with S._ Fourth edition. Springer.
95pub fn aids2() -> PolarsResult<DataFrame> {
96 CsvReader::new(Cursor::new(include_str!("Aids2.csv"))).finish()
97}
98
99/// # Brain and Body Weights for 28 Species
100///
101/// ## Description:
102///
103/// Average brain and body weights for 28 species of land animals.
104///
105/// ## Usage:
106///
107/// Animals
108///
109/// ## Format:
110///
111/// * ‘body’ body weight in kg.
112/// * ‘brain’ brain weight in g.
113///
114/// ## Note:
115///
116/// The name ‘Animals’ avoided conflicts with a system dataset
117/// ‘animals’ in S-PLUS 4.5 and later.
118///
119/// ## Source:
120///
121/// P. J. Rousseeuw and A. M. Leroy (1987) _Robust Regression and
122/// Outlier Detection._ Wiley, p. 57.
123///
124/// ## References:
125///
126/// Venables, W. N. and Ripley, B. D. (2002) _Modern Applied
127/// Statistics with S-PLUS._ Fourth Edition. Springer.
128pub fn animals() -> PolarsResult<DataFrame> {
129 CsvReader::new(Cursor::new(include_str!("Animals.csv"))).finish()
130}
131
132/// # Anorexia Data on Weight Change
133///
134/// ## Description:
135///
136/// The ‘anorexia’ data frame has 72 rows and 3 columns. Weight
137/// change data for young female anorexia patients.
138///
139/// ## Usage:
140///
141/// anorexia
142///
143/// ## Format:
144///
145/// This data frame contains the following columns:
146///
147/// * ‘Treat’ Factor of three levels: ‘"Cont"’ (control), ‘"CBT"’
148/// (Cognitive Behavioural treatment) and ‘"FT"’ (family
149/// treatment).
150/// * ‘Prewt’ Weight of patient before study period, in lbs.
151/// * ‘Postwt’ Weight of patient after study period, in lbs.
152///
153/// ## Source:
154///
155/// Hand, D. J., Daly, F., McConway, K., Lunn, D. and Ostrowski, E.
156/// eds (1993) _A Handbook of Small Data Sets._ Chapman & Hall, Data
157/// set 285 (p. 229)
158///
159/// (Note that the original source mistakenly says that weights are in
160/// kg.)
161///
162/// ## References:
163///
164/// Venables, W. N. and Ripley, B. D. (2002) _Modern Applied
165/// Statistics with S._ Fourth edition. Springer.
166pub fn anorexia() -> PolarsResult<DataFrame> {
167 CsvReader::new(Cursor::new(include_str!("anorexia.csv"))).finish()
168}
169
170/// # Presence of Bacteria after Drug Treatments
171///
172/// ## Description:
173///
174/// Tests of the presence of the bacteria _H. influenzae_ in children
175/// with otitis media in the Northern Territory of Australia.
176///
177/// ## Usage:
178///
179/// bacteria
180///
181/// ## Format:
182///
183/// This data frame has 220 rows and the following columns:
184///
185/// * y presence or absence: a factor with levels ‘n’ and ‘y’.
186/// * ap active/placebo: a factor with levels ‘a’ and ‘p’.
/// * hilo hi/low compliance: a factor with levels ‘hi’ and ‘lo’.
188/// * week numeric: week of test.
189/// * ID subject ID: a factor.
190/// * trt a factor with levels ‘placebo’, ‘drug’ and ‘drug+’, a
191/// re-coding of ‘ap’ and ‘hilo’.
192///
193/// ## Details:
194///
195/// Dr A. Leach tested the effects of a drug on 50 children with a
196/// history of otitis media in the Northern Territory of Australia.
/// The children were randomized to the drug or a placebo, and
198/// also to receive active encouragement to comply with taking the
199/// drug.
200///
201/// The presence of _H. influenzae_ was checked at weeks 0, 2, 4, 6
202/// and 11: 30 of the checks were missing and are not included in this
203/// data frame.
204///
205/// ## Source:
206///
207/// Dr Amanda Leach _via_ Mr James McBroom.
208///
209/// ## References:
210///
211/// Menzies School of Health Research 1999-2000 Annual Report. p.20.
212/// <https://www.menzies.edu.au/icms_docs/172302_2000_Annual_report.pdf>.
213///
214/// Venables, W. N. and Ripley, B. D. (2002) _Modern Applied
215/// Statistics with S._ Fourth edition. Springer.
216///
217/// ## Examples:
218///
219/// ```r
220/// contrasts(bacteria$trt) <- structure(contr.sdif(3),
221/// dimnames = list(NULL, c("drug", "encourage")))
222/// ## fixed effects analyses
223/// ## IGNORE_RDIFF_BEGIN
224/// summary(glm(y ~ trt * week, binomial, data = bacteria))
225/// summary(glm(y ~ trt + week, binomial, data = bacteria))
226/// summary(glm(y ~ trt + I(week > 2), binomial, data = bacteria))
227/// ## IGNORE_RDIFF_END
228///
229/// # conditional random-effects analysis
230/// library(survival)
231/// bacteria$Time <- rep(1, nrow(bacteria))
232/// coxph(Surv(Time, unclass(y)) ~ week + strata(ID),
233/// data = bacteria, method = "exact")
234/// coxph(Surv(Time, unclass(y)) ~ factor(week) + strata(ID),
235/// data = bacteria, method = "exact")
236/// coxph(Surv(Time, unclass(y)) ~ I(week > 2) + strata(ID),
237/// data = bacteria, method = "exact")
238///
239/// # PQL glmm analysis
240/// library(nlme)
241/// ## IGNORE_RDIFF_BEGIN
242/// summary(glmmPQL(y ~ trt + I(week > 2), random = ~ 1 | ID,
243/// family = binomial, data = bacteria))
244/// ## IGNORE_RDIFF_END
245/// ```
246pub fn bacteria() -> PolarsResult<DataFrame> {
247 CsvReader::new(Cursor::new(include_str!("bacteria.csv"))).finish()
248}
249
250/// # Body Temperature Series of Beaver 1
251///
252/// ## Description:
253///
254/// Reynolds (1994) describes a small part of a study of the long-term
255/// temperature dynamics of beaver _Castor canadensis_ in
256/// north-central Wisconsin. Body temperature was measured by
/// telemetry every 10 minutes for four females, but data from one
258/// period of less than a day for each of two animals is used there.
259///
260/// ## Usage:
261///
262/// beav1
263///
264/// ## Format:
265///
266/// The ‘beav1’ data frame has 114 rows and 4 columns. This data
267/// frame contains the following columns:
268///
269/// * ‘day’ Day of observation (in days since the beginning of 1990),
270/// December 12-13.
271/// * ‘time’ Time of observation, in the form ‘0330’ for 3.30am.
272/// * ‘temp’ Measured body temperature in degrees Celsius.
273/// * ‘activ’ Indicator of activity outside the retreat.
274///
275/// ## Note:
276///
277/// The observation at 22:20 is missing.
278///
279/// ## Source:
280///
281/// P. S. Reynolds (1994) Time-series analyses of beaver body
282/// temperatures. Chapter 11 of Lange, N., Ryan, L., Billard, L.,
283/// Brillinger, D., Conquest, L. and Greenhouse, J. eds (1994) _Case
284/// Studies in Biometry._ New York: John Wiley and Sons.
285///
286/// ## References:
287///
288/// Venables, W. N. and Ripley, B. D. (2002) _Modern Applied
289/// Statistics with S._ Fourth edition. Springer.
290///
291/// ## See Also:
292///
293/// ‘beav2’
294///
295/// ## Examples:
296///
297/// ```r
298/// beav1 <- within(beav1,
299/// hours <- 24*(day-346) + trunc(time/100) + (time%%100)/60)
300/// plot(beav1$hours, beav1$temp, type="l", xlab="time",
301/// ylab="temperature", main="Beaver 1")
302/// usr <- par("usr"); usr[3:4] <- c(-0.2, 8); par(usr=usr)
303/// lines(beav1$hours, beav1$activ, type="s", lty=2)
304/// temp <- ts(c(beav1$temp[1:82], NA, beav1$temp[83:114]),
305/// start = 9.5, frequency = 6)
306/// activ <- ts(c(beav1$activ[1:82], NA, beav1$activ[83:114]),
307/// start = 9.5, frequency = 6)
308///
309/// acf(temp[1:53])
310/// acf(temp[1:53], type = "partial")
311/// ar(temp[1:53])
312/// act <- c(rep(0, 10), activ)
313/// X <- cbind(1, act = act[11:125], act1 = act[10:124],
314/// act2 = act[9:123], act3 = act[8:122])
315/// alpha <- 0.80
316/// stemp <- as.vector(temp - alpha*lag(temp, -1))
317/// sX <- X[-1, ] - alpha * X[-115,]
318/// beav1.ls <- lm(stemp ~ -1 + sX, na.action = na.omit)
319/// summary(beav1.ls, correlation = FALSE)
320/// rm(temp, activ)
321/// ```
322pub fn beav1() -> PolarsResult<DataFrame> {
323 CsvReader::new(Cursor::new(include_str!("beav1.csv"))).finish()
324}
325
326/// # Body Temperature Series of Beaver 2
327///
328/// ## Description:
329///
330/// Reynolds (1994) describes a small part of a study of the long-term
331/// temperature dynamics of beaver _Castor canadensis_ in
332/// north-central Wisconsin. Body temperature was measured by
/// telemetry every 10 minutes for four females, but data from one
334/// period of less than a day for each of two animals is used there.
335///
336/// ## Usage:
337///
338/// beav2
339///
340/// ## Format:
341///
/// The ‘beav2’ data frame has 100 rows and 4 columns. This data
/// frame contains the following columns:
///
344/// * ‘day’ Day of observation (in days since the beginning of 1990),
345/// November 3-4.
346/// * ‘time’ Time of observation, in the form ‘0330’ for 3.30am.
347/// * ‘temp’ Measured body temperature in degrees Celsius.
348/// * ‘activ’ Indicator of activity outside the retreat.
349///
350/// ## Source:
351///
352/// P. S. Reynolds (1994) Time-series analyses of beaver body
353/// temperatures. Chapter 11 of Lange, N., Ryan, L., Billard, L.,
354/// Brillinger, D., Conquest, L. and Greenhouse, J. eds (1994) _Case
355/// Studies in Biometry._ New York: John Wiley and Sons.
356///
357/// ## References:
358///
359/// Venables, W. N. and Ripley, B. D. (2002) _Modern Applied
360/// Statistics with S._ Fourth edition. Springer.
361///
362/// ## See Also:
363///
364/// ‘beav1’
365///
366/// ## Examples:
367///
368/// ```r
369/// attach(beav2)
370/// beav2$hours <- 24*(day-307) + trunc(time/100) + (time%%100)/60
371/// plot(beav2$hours, beav2$temp, type = "l", xlab = "time",
372/// ylab = "temperature", main = "Beaver 2")
373/// usr <- par("usr"); usr[3:4] <- c(-0.2, 8); par(usr = usr)
374/// lines(beav2$hours, beav2$activ, type = "s", lty = 2)
375///
376/// temp <- ts(temp, start = 8+2/3, frequency = 6)
377/// activ <- ts(activ, start = 8+2/3, frequency = 6)
378/// acf(temp[activ == 0]); acf(temp[activ == 1]) # also look at PACFs
379/// ar(temp[activ == 0]); ar(temp[activ == 1])
380///
381/// arima(temp, order = c(1,0,0), xreg = activ)
382/// dreg <- cbind(sin = sin(2*pi*beav2$hours/24), cos = cos(2*pi*beav2$hours/24))
383/// arima(temp, order = c(1,0,0), xreg = cbind(active=activ, dreg))
384///
385/// ## IGNORE_RDIFF_BEGIN
386/// library(nlme) # for gls and corAR1
387/// beav2.gls <- gls(temp ~ activ, data = beav2, correlation = corAR1(0.8),
388/// method = "ML")
389/// summary(beav2.gls)
390/// summary(update(beav2.gls, subset = 6:100))
391/// detach("beav2"); rm(temp, activ)
392/// ## IGNORE_RDIFF_END
393/// ```
394pub fn beav2() -> PolarsResult<DataFrame> {
395 CsvReader::new(Cursor::new(include_str!("beav2.csv"))).finish()
396}
397
398/// # Biopsy Data on Breast Cancer Patients
399///
400/// ## Description:
401///
402/// This breast cancer database was obtained from the University of
403/// Wisconsin Hospitals, Madison from Dr. William H. Wolberg. He
404/// assessed biopsies of breast tumours for 699 patients up to 15 July
405/// 1992; each of nine attributes has been scored on a scale of 1 to
406/// 10, and the outcome is also known. There are 699 rows and 11
407/// columns.
408///
409/// ## Usage:
410///
411/// biopsy
412///
413/// ## Format:
414///
415/// This data frame contains the following columns:
416///
417/// * ‘ID’ sample code number (not unique).
418/// * ‘V1’ clump thickness.
419/// * ‘V2’ uniformity of cell size.
420/// * ‘V3’ uniformity of cell shape.
421/// * ‘V4’ marginal adhesion.
422/// * ‘V5’ single epithelial cell size.
423/// * ‘V6’ bare nuclei (16 values are missing).
424/// * ‘V7’ bland chromatin.
425/// * ‘V8’ normal nucleoli.
426/// * ‘V9’ mitoses.
427/// * ‘class’ ‘"benign"’ or ‘"malignant"’.
428///
429/// ## Source:
430///
431/// P. M. Murphy and D. W. Aha (1992). UCI Repository of machine
432/// learning databases. \[Machine-readable data repository\]. Irvine,
433/// CA: University of California, Department of Information and
434/// Computer Science.
435///
436/// O. L. Mangasarian and W. H. Wolberg (1990) Cancer diagnosis via
437/// linear programming. _SIAM News_ *23*, pp 1 & 18.
438///
439/// William H. Wolberg and O.L. Mangasarian (1990) Multisurface method
440/// of pattern separation for medical diagnosis applied to breast
441/// cytology. _Proceedings of the National Academy of Sciences,
442/// U.S.A._ *87*, pp. 9193-9196.
443///
444/// O. L. Mangasarian, R. Setiono and W.H. Wolberg (1990) Pattern
445/// recognition via linear programming: Theory and application to
446/// medical diagnosis. In _Large-scale Numerical Optimization_ eds
447/// Thomas F. Coleman and Yuying Li, SIAM Publications, Philadelphia,
448/// pp 22-30.
449///
450/// K. P. Bennett and O. L. Mangasarian (1992) Robust linear
451/// programming discrimination of two linearly inseparable sets.
452/// _Optimization Methods and Software_ *1*, pp. 23-34 (Gordon &
453/// Breach Science Publishers).
454///
455/// ## References:
456///
457/// Venables, W. N. and Ripley, B. D. (2002) _Modern Applied
458/// Statistics with S-PLUS._ Fourth Edition. Springer.
459pub fn biopsy() -> PolarsResult<DataFrame> {
460 CsvReader::new(Cursor::new(include_str!("biopsy.csv"))).finish()
461}
462
463/// # Risk Factors Associated with Low Infant Birth Weight
464///
465/// ## Description:
466///
467/// The ‘birthwt’ data frame has 189 rows and 10 columns. The data
468/// were collected at Baystate Medical Center, Springfield, Mass
469/// during 1986.
470///
471/// ## Usage:
472///
473/// birthwt
474///
475/// ## Format:
476///
477/// This data frame contains the following columns:
478///
479/// * ‘low’ indicator of birth weight less than 2.5 kg.
480/// * ‘age’ mother's age in years.
481/// * ‘lwt’ mother's weight in pounds at last menstrual period.
482/// * ‘race’ mother's race (‘1’ = white, ‘2’ = black, ‘3’ = other).
483/// * ‘smoke’ smoking status during pregnancy.
484/// * ‘ptl’ number of previous premature labours.
485/// * ‘ht’ history of hypertension.
486/// * ‘ui’ presence of uterine irritability.
487/// * ‘ftv’ number of physician visits during the first trimester.
488/// * ‘bwt’ birth weight in grams.
489///
490/// ## Source:
491///
492/// Hosmer, D.W. and Lemeshow, S. (1989) _Applied Logistic
493/// Regression._ New York: Wiley
494///
/// ## References:
496///
497/// Venables, W. N. and Ripley, B. D. (2002) _Modern Applied
498/// Statistics with S._ Fourth edition. Springer.
499///
500/// ## Examples:
501///
502/// ```r
503/// bwt <- with(birthwt, {
504/// race <- factor(race, labels = c("white", "black", "other"))
505/// ptd <- factor(ptl > 0)
506/// ftv <- factor(ftv)
507/// levels(ftv)[-(1:2)] <- "2+"
508/// data.frame(low = factor(low), age, lwt, race, smoke = (smoke > 0),
509/// ptd, ht = (ht > 0), ui = (ui > 0), ftv)
510/// })
511/// options(contrasts = c("contr.treatment", "contr.poly"))
512/// glm(low ~ ., binomial, bwt)
513/// ```
514pub fn birthwt() -> PolarsResult<DataFrame> {
515 CsvReader::new(Cursor::new(include_str!("birthwt.csv"))).finish()
516}
517
518/// # Housing Values in Suburbs of Boston
519///
520/// ## Description:
521///
522/// The ‘Boston’ data frame has 506 rows and 14 columns.
523///
524/// ## Usage:
525///
526/// Boston
527///
528/// ## Format:
529///
530/// This data frame contains the following columns:
531///
532/// * ‘crim’ per capita crime rate by town.
533/// * ‘zn’ proportion of residential land zoned for lots over 25,000
534/// sq.ft.
535/// * ‘indus’ proportion of non-retail business acres per town.
536/// * ‘chas’ Charles River dummy variable (= 1 if tract bounds river; 0
537/// otherwise).
538/// * ‘nox’ nitrogen oxides concentration (parts per 10 million).
539/// * ‘rm’ average number of rooms per dwelling.
540/// * ‘age’ proportion of owner-occupied units built prior to 1940.
541/// * ‘dis’ weighted mean of distances to five Boston employment
542/// centres.
543/// * ‘rad’ index of accessibility to radial highways.
544/// * ‘tax’ full-value property-tax rate per $10,000.
545/// * ‘ptratio’ pupil-teacher ratio by town.
546/// * ‘black’ 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by
547/// town.
548/// * ‘lstat’ lower status of the population (percent).
549/// * ‘medv’ median value of owner-occupied homes in $1000s.
550///
551/// ## Source:
552///
553/// Harrison, D. and Rubinfeld, D.L. (1978) Hedonic prices and the
554/// demand for clean air. _J. Environ. Economics and Management_ *5*,
555/// 81-102.
556///
557/// Belsley D.A., Kuh, E. and Welsch, R.E. (1980) _Regression
558/// Diagnostics. Identifying Influential Data and Sources of
559/// Collinearity._ New York: Wiley.
560pub fn boston() -> PolarsResult<DataFrame> {
561 CsvReader::new(Cursor::new(include_str!("Boston.csv"))).finish()
562}
563
564/// # Data from a cabbage field trial
565///
566/// ## Description:
567///
568/// The ‘cabbages’ data set has 60 observations and 4 variables
569///
570/// ## Usage:
571///
572/// cabbages
573///
574/// ## Format:
575///
576/// This data frame contains the following columns:
577///
578/// * ‘Cult’ Factor giving the cultivar of the cabbage, two levels:
579/// ‘c39’ and ‘c52’.
580/// * ‘Date’ Factor specifying one of three planting dates: ‘d16’, ‘d20’
581/// or ‘d21’.
582/// * ‘HeadWt’ Weight of the cabbage head, presumably in kg.
583/// * ‘VitC’ Ascorbic acid content, in undefined units.
584///
585/// ## Source:
586///
587/// Rawlings, J. O. (1988) _Applied Regression Analysis: A Research
588/// Tool._ Wadsworth and Brooks/Cole. Example 8.4, page 219.
589/// (Rawlings cites the original source as the files of the late Dr
590/// Gertrude M Cox.)
591///
592/// ## References:
593///
594/// Venables, W. N. and Ripley, B. D. (2002) _Modern Applied
595/// Statistics with S-PLUS._ Fourth Edition. Springer.
596pub fn cabbages() -> PolarsResult<DataFrame> {
597 CsvReader::new(Cursor::new(include_str!("cabbages.csv"))).finish()
598}
599
600/// # Colours of Eyes and Hair of People in Caithness
601///
602/// ## Description:
603///
604/// Data on the cross-classification of people in Caithness, Scotland,
605/// by eye and hair colour. The region of the UK is particularly
606/// interesting as there is a mixture of people of Nordic, Celtic and
607/// Anglo-Saxon origin.
608///
609/// ## Usage:
610///
611/// caith
612///
613/// ## Format:
614///
615/// A 4 by 5 table with rows the eye colours (blue, light, medium,
616/// dark) and columns the hair colours (fair, red, medium, dark,
617/// black).
618///
619/// ## Source:
620///
621/// Fisher, R.A. (1940) The precision of discriminant functions.
622/// _Annals of Eugenics (London)_ *10*, 422-429.
623///
624/// ## References:
625///
626/// Venables, W. N. and Ripley, B. D. (2002) _Modern Applied
627/// Statistics with S._ Fourth edition. Springer.
628///
629/// ## Examples:
630///
631/// ```r
632/// ## IGNORE_RDIFF_BEGIN
633/// ## The signs can vary by platform
634/// corresp(caith)
635/// ## IGNORE_RDIFF_END
636/// dimnames(caith)[[2]] <- c("F", "R", "M", "D", "B")
637/// par(mfcol=c(1,3))
638/// plot(corresp(caith, nf=2)); title("symmetric")
639/// plot(corresp(caith, nf=2), type="rows"); title("rows")
640/// plot(corresp(caith, nf=2), type="col"); title("columns")
641/// par(mfrow=c(1,1))
642/// ```
643pub fn caith() -> PolarsResult<DataFrame> {
644 CsvReader::new(Cursor::new(include_str!("caith.csv"))).finish()
645}
646
647/// # Data from 93 Cars on Sale in the USA in 1993
648///
649/// ## Description:
650///
651/// The ‘Cars93’ data frame has 93 rows and 27 columns.
652///
653/// ## Usage:
654///
655/// Cars93
656///
657/// ## Format:
658///
659/// This data frame contains the following columns:
660///
661/// * ‘Manufacturer’ Manufacturer.
662/// * ‘Model’ Model.
663/// * ‘Type’ Type: a factor with levels ‘"Small"’, ‘"Sporty"’,
664/// ‘"Compact"’, ‘"Midsize"’, ‘"Large"’ and ‘"Van"’.
665/// * ‘Min.Price’ Minimum Price (in $1,000): price for a basic version.
666/// * ‘Price’ Midrange Price (in $1,000): average of ‘Min.Price’ and
667/// ‘Max.Price’.
668/// * ‘Max.Price’ Maximum Price (in $1,000): price for “a premium
669/// version”.
670/// * ‘MPG.city’ City MPG (miles per US gallon by EPA rating).
671/// * ‘MPG.highway’ Highway MPG.
672/// * ‘AirBags’ Air Bags standard. Factor: none, driver only, or driver
673/// & passenger.
674/// * ‘DriveTrain’ Drive train type: rear wheel, front wheel or 4WD;
675/// (factor).
676/// * ‘Cylinders’ Number of cylinders (missing for Mazda RX-7, which has
677/// a rotary engine).
678/// * ‘EngineSize’ Engine size (litres).
679/// * ‘Horsepower’ Horsepower (maximum).
680/// * ‘RPM’ RPM (revs per minute at maximum horsepower).
681/// * ‘Rev.per.mile’ Engine revolutions per mile (in highest gear).
682/// * ‘Man.trans.avail’ Is a manual transmission version available? (yes
683/// or no, Factor).
684/// * ‘Fuel.tank.capacity’ Fuel tank capacity (US gallons).
685/// * ‘Passengers’ Passenger capacity (persons)
686/// * ‘Length’ Length (inches).
687/// * ‘Wheelbase’ Wheelbase (inches).
688/// * ‘Width’ Width (inches).
689/// * ‘Turn.circle’ U-turn space (feet).
690/// * ‘Rear.seat.room’ Rear seat room (inches) (missing for 2-seater
691/// vehicles).
692/// * ‘Luggage.room’ Luggage capacity (cubic feet) (missing for vans).
693/// * ‘Weight’ Weight (pounds).
694/// * ‘Origin’ Of non-USA or USA company origins? (factor).
695/// * ‘Make’ Combination of Manufacturer and Model (character).
696///
697/// ## Details:
698///
699/// Cars were selected at random from among 1993 passenger car models
700/// that were listed in both the _Consumer Reports_ issue and the
701/// _PACE Buying Guide_. Pickup trucks and Sport/Utility vehicles
702/// were eliminated due to incomplete information in the _Consumer
703/// Reports_ source. Duplicate models (e.g., Dodge Shadow and
704/// Plymouth Sundance) were listed at most once.
705///
706/// Further description can be found in Lock (1993).
707///
708/// ## Source:
709///
710/// Lock, R. H. (1993) 1993 New Car Data. _Journal of Statistics
711/// Education_ *1*(1). doi:10.1080/10691898.1993.11910459
712/// <https://doi.org/10.1080/10691898.1993.11910459>
713///
714/// ## References:
715///
716/// Venables, W. N. and Ripley, B. D. (2002) _Modern Applied
717/// Statistics with S-PLUS._ Fourth Edition. Springer.
718pub fn cars93() -> PolarsResult<DataFrame> {
719 CsvReader::new(Cursor::new(include_str!("Cars93.csv"))).finish()
720}
721
722/// # Anatomical Data from Domestic Cats
723///
724/// ## Description:
725///
726/// The heart and body weights of samples of male and female cats used
727/// for _digitalis_ experiments. The cats were all adult, over 2 kg
728/// body weight.
729///
730/// ## Usage:
731///
732/// cats
733///
734/// ## Format:
735///
736/// This data frame contains the following columns:
737///
738/// * ‘Sex’ sex: Factor with levels ‘"F"’ and ‘"M"’.
739/// * ‘Bwt’ body weight in kg.
740/// * ‘Hwt’ heart weight in g.
741///
742/// ## Source:
743///
744/// R. A. Fisher (1947) The analysis of covariance method for the
745/// relation between a part and the whole, _Biometrics_ *3*, 65-68.
746///
747/// ## References:
748///
749/// Venables, W. N. and Ripley, B. D. (2002) _Modern Applied
750/// Statistics with S._ Fourth edition. Springer.
751pub fn cats() -> PolarsResult<DataFrame> {
752 CsvReader::new(Cursor::new(include_str!("cats.csv"))).finish()
753}
754
755/// # Heat Evolved by Setting Cements
756///
757/// ## Description:
758///
759/// Experiment on the heat evolved in the setting of each of 13
760/// cements.
761///
762/// ## Usage:
763///
764/// cement
765///
766/// ## Format:
767///
768/// * ‘x1, x2, x3, x4’ Proportions (%) of active ingredients.
769/// * ‘y’ heat evolved in cals/gm.
770///
771/// ## Details:
772///
773/// Thirteen samples of Portland cement were set. For each sample, the
/// percentages of the four main chemical ingredients were accurately
775/// measured. While the cement was setting the amount of heat evolved
776/// was also measured.
777///
778/// ## Source:
779///
780/// Woods, H., Steinour, H.H. and Starke, H.R. (1932) Effect of
781/// composition of Portland cement on heat evolved during hardening.
782/// _Industrial Engineering and Chemistry_, *24*, 1207-1214.
783///
784/// ## References:
785///
786/// Hald, A. (1957) _Statistical Theory with Engineering
787/// Applications._ Wiley, New York.
788///
789/// ## Examples:
790///
791/// ```r
792/// lm(y ~ x1 + x2 + x3 + x4, cement)
793/// ```
794pub fn cement() -> PolarsResult<DataFrame> {
795 CsvReader::new(Cursor::new(include_str!("cement.csv"))).finish()
796}
797
798/// # Copper in Wholemeal Flour
799///
800/// ## Description:
801///
802/// A numeric vector of 24 determinations of copper in wholemeal
803/// flour, in parts per million.
804///
805/// ## Usage:
806///
807/// chem
808///
809/// ## Source:
810///
811/// Analytical Methods Committee (1989) Robust statistics - how not to
812/// reject outliers. _The Analyst_ *114*, 1693-1702.
813///
814/// ## References:
815///
816/// Venables, W. N. and Ripley, B. D. (2002) _Modern Applied
817/// Statistics with S._ Fourth edition. Springer.
818pub fn chem() -> PolarsResult<DataFrame> {
819 CsvReader::new(Cursor::new(include_str!("chem.csv"))).finish()
820}
821
822/// # Co-operative Trial in Analytical Chemistry
823///
824/// ## Description:
825///
826/// Seven specimens were sent to 6 laboratories in 3 separate batches
827/// and each analysed for Analyte. Each analysis was duplicated.
828///
829/// ## Usage:
830///
831/// coop
832///
833/// ## Format:
834///
835/// This data frame contains the following columns:
836///
837/// * ‘Lab’ Laboratory, ‘L1’, ‘L2’, ..., ‘L6’.
838/// * ‘Spc’ Specimen, ‘S1’, ‘S2’, ..., ‘S7’.
839/// * ‘Bat’ Batch, ‘B1’, ‘B2’, ‘B3’ (nested within ‘Spc/Lab’),
840/// * ‘Conc’ Concentration of Analyte in g/kg.
841///
842/// ## Source:
843///
844/// Analytical Methods Committee (1987) Recommendations for the
845/// conduct and interpretation of co-operative trials, _The Analyst_
846/// *112*, 679-686.
847///
848/// ## References:
849///
850/// Venables, W. N. and Ripley, B. D. (2002) _Modern Applied
851/// Statistics with S._ Fourth edition. Springer.
852///
853/// ## See Also:
854///
855/// ‘chem’, ‘abbey’.
856pub fn coop() -> PolarsResult<DataFrame> {
857 CsvReader::new(Cursor::new(include_str!("coop.csv"))).finish()
858}
859
860/// # Performance of Computer CPUs
861///
862/// ## Description:
863///
864/// A relative performance measure and characteristics of 209 CPUs.
865///
866/// ## Usage:
867///
868/// cpus
869///
870/// ## Format:
871///
872/// The components are:
873///
874/// * ‘name’ manufacturer and model.
875/// * ‘syct’ cycle time in nanoseconds.
876/// * ‘mmin’ minimum main memory in kilobytes.
877/// * ‘mmax’ maximum main memory in kilobytes.
878/// * ‘cach’ cache size in kilobytes.
879/// * ‘chmin’ minimum number of channels.
880/// * ‘chmax’ maximum number of channels.
881/// * ‘perf’ published performance on a benchmark mix relative to an IBM
882/// 370/158-3.
883/// * ‘estperf’ estimated performance (by Ein-Dor & Feldmesser).
884///
885/// ## Source:
886///
887/// P. Ein-Dor and J. Feldmesser (1987) Attributes of the performance
888/// of central processing units: a relative performance prediction
889/// model. _Comm. ACM._ *30*, 308-317.
890///
891/// ## References:
892///
893/// Venables, W. N. and Ripley, B. D. (2002) _Modern Applied
894/// Statistics with S._ Fourth edition. Springer.
895pub fn cpus() -> PolarsResult<DataFrame> {
896 CsvReader::new(Cursor::new(include_str!("cpus.csv"))).finish()
897}
898
899const CRABS: &'static str = include_str!("crabs.csv");
900
901/// # Morphological Measurements on Leptograpsus Crabs
902///
903/// ## Description:
904///
905/// The ‘crabs’ data frame has 200 rows and 8 columns, describing 5
906/// morphological measurements on 50 crabs each of two colour forms
907/// and both sexes, of the species _Leptograpsus variegatus_ collected
908/// at Fremantle, W. Australia.
909///
910/// ## Usage:
911///
912/// crabs
913///
914/// ## Format:
915///
916/// This data frame contains the following columns:
917///
918/// * ‘sp’ ‘species’ - ‘"B"’ or ‘"O"’ for blue or orange.
919/// * ‘sex’ as it says.
920/// * ‘index’ index ‘1:50’ within each of the four groups.
921/// * ‘FL’ frontal lobe size (mm).
922/// * ‘RW’ rear width (mm).
923/// * ‘CL’ carapace length (mm).
924/// * ‘CW’ carapace width (mm).
925/// * ‘BD’ body depth (mm).
926///
927/// ## Source:
928///
929/// Campbell, N.A. and Mahon, R.J. (1974) A multivariate study of
930/// variation in two species of rock crab of genus _Leptograpsus._
931/// _Australian Journal of Zoology_ *22*, 417-425.
932///
933/// ## References:
934///
935/// Venables, W. N. and Ripley, B. D. (2002) _Modern Applied
936/// Statistics with S._ Fourth edition. Springer.
937pub fn crabs() -> PolarsResult<DataFrame> {
938 CsvReader::new(Cursor::new(CRABS)).finish()
939}
940
941/// # Diagnostic Tests on Patients with Cushing's Syndrome
942///
943/// ## Description:
944///
945/// Cushing's syndrome is a hypertensive disorder associated with
946/// over-secretion of cortisol by the adrenal gland. The observations
947/// are urinary excretion rates of two steroid metabolites.
948///
949/// ## Usage:
950///
951/// Cushings
952///
953/// ## Format:
954///
955/// The ‘Cushings’ data frame has 27 rows and 3 columns:
956///
957/// * ‘Tetrahydrocortisone’ urinary excretion rate (mg/24hr) of
958/// Tetrahydrocortisone.
959/// * ‘Pregnanetriol’ urinary excretion rate (mg/24hr) of Pregnanetriol.
960/// * ‘Type’ underlying type of syndrome, coded ‘a’ (adenoma) , ‘b’
961/// (bilateral hyperplasia), ‘c’ (carcinoma) or ‘u’ for unknown.
962///
963/// ## Source:
964///
965/// J. Aitchison and I. R. Dunsmore (1975) _Statistical Prediction
966/// Analysis._ Cambridge University Press, Tables 11.1-3.
967///
968/// ## References:
969///
970/// Venables, W. N. and Ripley, B. D. (2002) _Modern Applied
971/// Statistics with S._ Fourth edition. Springer.
972pub fn cushings() -> PolarsResult<DataFrame> {
973 CsvReader::new(Cursor::new(include_str!("Cushings.csv"))).finish()
974}
975
976/// # DDT in Kale
977///
978/// ## Description:
979///
980/// A numeric vector of 15 measurements by different laboratories of
981/// the pesticide DDT in kale, in ppm (parts per million) using the
982/// multiple pesticide residue measurement.
983///
984/// ## Usage:
985///
986/// DDT
987///
988/// ## Source:
989///
990/// C. E. Finsterwalder (1976) Collaborative study of an extension of
991/// the Mills _et al_ method for the determination of pesticide
992/// residues in food. _J. Off. Anal. Chem._ *59*, 169-171
993///
994/// R. G. Staudte and S. J. Sheather (1990) _Robust Estimation and
995/// Testing._ Wiley
996pub fn ddt() -> PolarsResult<DataFrame> {
997 CsvReader::new(Cursor::new(include_str!("DDT.csv"))).finish()
998}
999
1000/// # Monthly Deaths from Lung Diseases in the UK
1001///
1002/// ## Description:
1003///
1004/// A time series giving the monthly deaths from bronchitis, emphysema
/// and asthma in the UK, 1974-1979, both sexes (‘deaths’).
1006///
1007/// ## Usage:
1008///
1009/// deaths
1010///
1011/// ## Source:
1012///
1013/// P. J. Diggle (1990) _Time Series: A Biostatistical Introduction._
1014/// Oxford, table A.3
1015///
1016/// ## References:
1017///
1018/// Venables, W. N. and Ripley, B. D. (2002) _Modern Applied
1019/// Statistics with S._ Fourth edition. Springer.
1020///
1021/// ## See Also:
1022///
/// This is the same as dataset ‘ldeaths’ in R's ‘datasets’ package.
1024pub fn deaths() -> PolarsResult<DataFrame> {
1025 CsvReader::new(Cursor::new(include_str!("deaths.csv"))).finish()
1026}
1027
1028/// # Deaths of Car Drivers in Great Britain 1969-84
1029///
1030/// ## Description:
1031///
1032/// A regular time series giving the monthly totals of car drivers in
1033/// Great Britain killed or seriously injured Jan 1969 to Dec 1984.
/// Compulsory wearing of seat belts was introduced on 31 Jan 1983.
1035///
1036/// ## Usage:
1037///
1038/// drivers
1039///
1040/// ## Source:
1041///
1042/// Harvey, A.C. (1989) _Forecasting, Structural Time Series Models
1043/// and the Kalman Filter._ Cambridge University Press, pp. 519-523.
1044///
1045/// ## References:
1046///
1047/// Venables, W. N. and Ripley, B. D. (2002) _Modern Applied
1048/// Statistics with S-PLUS._ Fourth Edition. Springer.
1049pub fn drivers() -> PolarsResult<DataFrame> {
1050 CsvReader::new(Cursor::new(include_str!("drivers.csv"))).finish()
1051}
1052
1053/// # Foraging Ecology of Bald Eagles
1054///
1055/// ## Description:
1056///
1057/// Knight and Skagen collected during a field study on the foraging
1058/// behaviour of wintering Bald Eagles in Washington State, USA data
1059/// concerning 160 attempts by one (pirating) Bald Eagle to steal a
1060/// chum salmon from another (feeding) Bald Eagle.
1061///
1062/// ## Usage:
1063///
1064/// eagles
1065///
1066/// ## Format:
1067///
1068/// The ‘eagles’ data frame has 8 rows and 5 columns.
1069///
1070/// * ‘y’ Number of successful attempts.
1071/// * ‘n’ Total number of attempts.
1072/// * ‘P’ Size of pirating eagle (‘L’ = large, ‘S’ = small).
1073/// * ‘A’ Age of pirating eagle (‘I’ = immature, ‘A’ = adult).
1074/// * ‘V’ Size of victim eagle (‘L’ = large, ‘S’ = small).
1075///
1076/// ## Source:
1077///
1078/// Knight, R. L. and Skagen, S. K. (1988) Agonistic asymmetries and
1079/// the foraging ecology of Bald Eagles. _Ecology_ *69*, 1188-1194.
1080///
1081/// ## References:
1082///
1083/// Venables, W. N. and Ripley, B. D. (2002) _Modern Applied
1084/// Statistics with S-PLUS._ Fourth Edition. Springer.
1085///
1086/// ## Examples:
1087///
1088/// ```r
1089/// eagles.glm <- glm(cbind(y, n - y) ~ P*A + V, data = eagles,
1090/// family = binomial)
1091/// dropterm(eagles.glm)
1092/// prof <- profile(eagles.glm)
1093/// plot(prof)
1094/// pairs(prof)
1095/// ```
1096pub fn eagles() -> PolarsResult<DataFrame> {
1097 CsvReader::new(Cursor::new(include_str!("eagles.csv"))).finish()
1098}
1099
1100/// # Seizure Counts for Epileptics
1101///
1102/// ## Description:
1103///
1104/// Thall and Vail (1990) give a data set on two-week seizure counts
1105/// for 59 epileptics. The number of seizures was recorded for a
1106/// baseline period of 8 weeks, and then patients were randomly
1107/// assigned to a treatment group or a control group. Counts were
1108/// then recorded for four successive two-week periods. The subject's
1109/// age is the only covariate.
1110///
1111/// ## Usage:
1112///
1113/// epil
1114///
1115/// ## Format:
1116///
1117/// This data frame has 236 rows and the following 9 columns:
1118///
1119/// * ‘y’ the count for the 2-week period.
1120/// * ‘trt’ treatment, ‘"placebo"’ or ‘"progabide"’.
1121/// * ‘base’ the counts in the baseline 8-week period.
1122/// * ‘age’ subject's age, in years.
1123/// * ‘V4’ ‘0/1’ indicator variable of period 4.
1124/// * ‘subject’ subject number, 1 to 59.
1125/// * ‘period’ period, 1 to 4.
1126/// * ‘lbase’ log-counts for the baseline period, centred to have zero
1127/// mean.
1128/// * ‘lage’ log-ages, centred to have zero mean.
1129///
1130/// ## Source:
1131///
1132/// Thall, P. F. and Vail, S. C. (1990) Some covariance models for
1133/// longitudinal count data with over-dispersion. _Biometrics_ *46*,
1134/// 657-671.
1135///
1136/// ## References:
1137///
1138/// Venables, W. N. and Ripley, B. D. (2002) _Modern Applied
1139/// Statistics with S._ Fourth Edition. Springer.
1140///
1141/// ## Examples:
1142///
1143/// ```r
1144/// ## IGNORE_RDIFF_BEGIN
1145/// summary(glm(y ~ lbase*trt + lage + V4, family = poisson,
1146/// data = epil), correlation = FALSE)
1147/// ## IGNORE_RDIFF_END
1148/// epil2 <- epil[epil$period == 1, ]
1149/// epil2["period"] <- rep(0, 59); epil2["y"] <- epil2["base"]
1150/// epil["time"] <- 1; epil2["time"] <- 4
1151/// epil2 <- rbind(epil, epil2)
1152/// epil2$pred <- unclass(epil2$trt) * (epil2$period > 0)
1153/// epil2$subject <- factor(epil2$subject)
1154/// epil3 <- aggregate(epil2, list(epil2$subject, epil2$period > 0),
1155/// function(x) if(is.numeric(x)) sum(x) else x[1])
1156/// epil3$pred <- factor(epil3$pred,
1157/// labels = c("base", "placebo", "drug"))
1158///
1159/// contrasts(epil3$pred) <- structure(contr.sdif(3),
1160/// dimnames = list(NULL, c("placebo-base", "drug-placebo")))
1161/// ## IGNORE_RDIFF_BEGIN
1162/// summary(glm(y ~ pred + factor(subject) + offset(log(time)),
1163/// family = poisson, data = epil3), correlation = FALSE)
1164/// ## IGNORE_RDIFF_END
1165///
1166/// summary(glmmPQL(y ~ lbase*trt + lage + V4,
1167/// random = ~ 1 | subject,
1168/// family = poisson, data = epil))
1169/// summary(glmmPQL(y ~ pred, random = ~1 | subject,
1170/// family = poisson, data = epil3))
1171/// ```
1172pub fn epil() -> PolarsResult<DataFrame> {
1173 CsvReader::new(Cursor::new(include_str!("epil.csv"))).finish()
1174}
1175
1176/// # Ecological Factors in Farm Management
1177///
1178/// ## Description:
1179///
1180/// The ‘farms’ data frame has 20 rows and 4 columns. The rows are
1181/// farms on the Dutch island of Terschelling and the columns are
1182/// factors describing the management of grassland.
1183///
1184/// ## Usage:
1185///
1186/// farms
1187///
1188/// ## Format:
1189///
1190/// This data frame contains the following columns:
1191///
1192/// * ‘Mois’ Five levels of soil moisture - level 3 does not occur at
1193/// these 20 farms.
1194/// * ‘Manag’ Grassland management type (‘SF’ = standard, ‘BF’ =
1195/// biological, ‘HF’ = hobby farming, ‘NM’ = nature
1196/// conservation).
1197/// * ‘Use’ Grassland use (‘U1’ = hay production, ‘U2’ = intermediate,
1198/// ‘U3’ = grazing).
1199/// * ‘Manure’ Manure usage - classes ‘C0’ to ‘C4’.
1200///
1201/// ## Source:
1202///
1203/// J.C. Gower and D.J. Hand (1996) _Biplots_. Chapman & Hall, Table
1204/// 4.6.
1205///
1206/// ## Quoted as from:
1207/// R.H.G. Jongman, C.J.F. ter Braak and O.F.R. van Tongeren (1987)
1208/// _Data Analysis in Community and Landscape Ecology._ PUDOC,
1209/// Wageningen.
1210///
1211/// ## References:
1212///
1213/// Venables, W. N. and Ripley, B. D. (2002) _Modern Applied
1214/// Statistics with S._ Fourth edition. Springer.
1215///
1216/// ## Examples:
1217///
1218/// ```r
1219/// farms.mca <- mca(farms, abbrev = TRUE) # Use levels as names
1220/// eqscplot(farms.mca$cs, type = "n")
1221/// text(farms.mca$rs, cex = 0.7)
1222/// text(farms.mca$cs, labels = dimnames(farms.mca$cs)[[1]], cex = 0.7)
1223/// ```
1224pub fn farms() -> PolarsResult<DataFrame> {
1225 CsvReader::new(Cursor::new(include_str!("farms.csv"))).finish()
1226}
1227
1228/// # Measurements of Forensic Glass Fragments
1229///
1230/// ## Description:
1231///
1232/// The ‘fgl’ data frame has 214 rows and 10 columns. It was collected
1233/// by B. German on fragments of glass collected in forensic work.
1234///
1235/// ## Usage:
1236///
1237/// fgl
1238///
1239/// ## Format:
1240///
1241/// This data frame contains the following columns:
1242///
1243/// * ‘RI’ refractive index; more precisely the refractive index is
1244/// 1.518xxxx.
1245///
1246/// The next 8 measurements are percentages by weight of oxides.
1247///
1248/// * ‘Na’ sodium.
/// * ‘Mg’ magnesium.
1250/// * ‘Al’ aluminium.
1251/// * ‘Si’ silicon.
1252/// * ‘K’ potassium.
1253/// * ‘Ca’ calcium.
1254/// * ‘Ba’ barium.
1255/// * ‘Fe’ iron.
1256/// * ‘type’ The fragments were originally classed into seven types, one
1257/// of which was absent in this dataset. The categories which
1258/// occur are window float glass (‘WinF’: 70), window non-float
1259/// glass (‘WinNF’: 76), vehicle window glass (‘Veh’: 17),
1260/// containers (‘Con’: 13), tableware (‘Tabl’: 9) and vehicle
1261/// headlamps (‘Head’: 29).
1262///
1263/// ## References:
1264///
1265/// Venables, W. N. and Ripley, B. D. (2002) _Modern Applied
1266/// Statistics with S._ Fourth edition. Springer.
1267pub fn fgl() -> PolarsResult<DataFrame> {
1268 CsvReader::new(Cursor::new(include_str!("fgl.csv"))).finish()
1269}
1270
1271/// # Forbes' Data on Boiling Points in the Alps
1272///
1273/// ## Description:
1274///
1275/// A data frame with 17 observations on boiling point of water and
1276/// barometric pressure in inches of mercury.
1277///
1278/// ## Usage:
1279///
1280/// forbes
1281///
1282/// ## Format:
1283///
/// * ‘bp’ boiling point (degrees Fahrenheit).
1285/// * ‘pres’ barometric pressure in inches of mercury.
1286///
1287/// ## Source:
1288///
1289/// A. C. Atkinson (1985) _Plots, Transformations and Regression._
1290/// Oxford.
1291///
1292/// S. Weisberg (1980) _Applied Linear Regression._ Wiley.
1293pub fn forbes() -> PolarsResult<DataFrame> {
1294 CsvReader::new(Cursor::new(include_str!("forbes.csv"))).finish()
1295}
1296
1297/// # Level of GAG in Urine of Children
1298///
1299/// ## Description:
1300///
1301/// Data were collected on the concentration of a chemical GAG in the
1302/// urine of 314 children aged from zero to seventeen years. The aim
/// of the study was to produce a chart to help a paediatrician to
1304/// assess if a child's GAG concentration is ‘normal’.
1305///
1306/// ## Usage:
1307///
1308/// GAGurine
1309///
1310/// ## Format:
1311///
1312/// This data frame contains the following columns:
1313///
1314/// * ‘Age’ age of child in years.
1315/// * ‘GAG’ concentration of GAG (the units have been lost).
1316///
1317/// ## Source:
1318///
1319/// Mrs Susan Prosser, Paediatrics Department, University of Oxford,
1320/// via Department of Statistics Consulting Service.
1321///
1322/// ## References:
1323///
1324/// Venables, W. N. and Ripley, B. D. (2002) _Modern Applied
1325/// Statistics with S._ Fourth edition. Springer.
1326pub fn gagurine() -> PolarsResult<DataFrame> {
1327 CsvReader::new(Cursor::new(include_str!("GAGurine.csv"))).finish()
1328}
1329
1330/// # Velocities for 82 Galaxies
1331///
1332/// ## Description:
1333///
1334/// A numeric vector of velocities in km/sec of 82 galaxies from 6
1335/// well-separated conic sections of an ‘unfilled’ survey of the
1336/// Corona Borealis region. Multimodality in such surveys is evidence
1337/// for voids and superclusters in the far universe.
1338///
1339/// ## Usage:
1340///
1341/// galaxies
1342///
1343/// ## Note:
1344///
1345/// There is an 83rd measurement of 5607 km/sec in the Postman _et
1346/// al._ paper which is omitted in Roeder (1990) and from the dataset
1347/// here.
1348///
1349/// There is also a typo: this dataset has 78th observation 26690
1350/// which should be 26960.
1351///
1352/// ## Source:
1353///
1354/// Roeder, K. (1990) Density estimation with confidence sets
1355/// exemplified by superclusters and voids in galaxies. _Journal of
1356/// the American Statistical Association_ *85*, 617-624.
1357///
1358/// Postman, M., Huchra, J. P. and Geller, M. J. (1986) Probes of
1359/// large-scale structures in the Corona Borealis region.
1360/// _Astronomical Journal_ *92*, 1238-1247.
1361///
1362/// ## References:
1363///
1364/// Venables, W. N. and Ripley, B. D. (2002) _Modern Applied
1365/// Statistics with S._ Fourth edition. Springer.
1366///
1367/// ## Examples:
1368///
1369/// ```r
1370/// gal <- galaxies/1000
1371/// c(width.SJ(gal, method = "dpi"), width.SJ(gal))
1372/// plot(x = c(0, 40), y = c(0, 0.3), type = "n", bty = "l",
1373/// xlab = "velocity of galaxy (1000km/s)", ylab = "density")
1374/// rug(gal)
1375/// lines(density(gal, width = 3.25, n = 200), lty = 1)
1376/// lines(density(gal, width = 2.56, n = 200), lty = 3)
1377/// ```
1378pub fn galaxies() -> PolarsResult<DataFrame> {
1379 CsvReader::new(Cursor::new(include_str!("galaxies.csv"))).finish()
1380}
1381
1382/// # Remission Times of Leukaemia Patients
1383///
1384/// ## Description:
1385///
1386/// A data frame from a trial of 42 leukaemia patients. Some were
1387/// treated with the drug _6-mercaptopurine_ and the rest are
1388/// controls. The trial was designed as matched pairs, both withdrawn
1389/// from the trial when either came out of remission.
1390///
1391/// ## Usage:
1392///
1393/// gehan
1394///
1395/// ## Format:
1396///
1397/// This data frame contains the following columns:
1398///
1399/// * ‘pair’ label for pair.
1400/// * ‘time’ remission time in weeks.
1401/// * ‘cens’ censoring, 0/1.
1402/// * ‘treat’ treatment, control or 6-MP.
1403///
1404/// ## Source:
1405///
1406/// Cox, D. R. and Oakes, D. (1984) _Analysis of Survival Data._
1407/// Chapman & Hall, p. 7. Taken from
1408///
1409/// Gehan, E.A. (1965) A generalized Wilcoxon test for comparing
1410/// arbitrarily single-censored samples. _Biometrika_ *52*, 203-233.
1411///
1412/// ## References:
1413///
1414/// Venables, W. N. and Ripley, B. D. (2002) _Modern Applied
1415/// Statistics with S._ Fourth edition. Springer.
1416///
1417/// ## Examples:
1418///
1419/// ```r
1420/// library(survival)
1421/// gehan.surv <- survfit(Surv(time, cens) ~ treat, data = gehan,
1422/// conf.type = "log-log")
1423/// summary(gehan.surv)
1424/// survreg(Surv(time, cens) ~ factor(pair) + treat, gehan, dist = "exponential")
1425/// summary(survreg(Surv(time, cens) ~ treat, gehan, dist = "exponential"))
1426/// summary(survreg(Surv(time, cens) ~ treat, gehan))
1427/// gehan.cox <- coxph(Surv(time, cens) ~ treat, gehan)
1428/// summary(gehan.cox)
1429/// ```
1430pub fn gehan() -> PolarsResult<DataFrame> {
1431 CsvReader::new(Cursor::new(include_str!("gehan.csv"))).finish()
1432}
1433
1434/// # Rat Genotype Data
1435///
1436/// ## Description:
1437///
1438/// Data from a foster feeding experiment with rat mothers and litters
1439/// of four different genotypes: ‘A’, ‘B’, ‘I’ and ‘J’. Rat litters
1440/// were separated from their natural mothers at birth and given to
1441/// foster mothers to rear.
1442///
1443/// ## Usage:
1444///
1445/// genotype
1446///
1447/// ## Format:
1448///
1449/// The data frame has the following components:
1450///
1451/// * ‘Litter’ genotype of the litter.
1452/// * ‘Mother’ genotype of the foster mother.
1453/// * ‘Wt’ Litter average weight gain of the litter, in grams at age 28
1454/// days. (The source states that the within-litter variability
1455/// is negligible.)
1456///
1457/// ## Source:
1458///
1459/// Scheffe, H. (1959) _The Analysis of Variance_ Wiley p. 140.
1460///
1461/// Bailey, D. W. (1953) _The Inheritance of Maternal Influences on
1462/// the Growth of the Rat._ Unpublished Ph.D. thesis, University of
1463/// California. Table B of the Appendix.
1464///
1465/// ## References:
1466///
1467/// Venables, W. N. and Ripley, B. D. (2002) _Modern Applied
1468/// Statistics with S-PLUS._ Fourth Edition. Springer.
1469pub fn genotype() -> PolarsResult<DataFrame> {
1470 CsvReader::new(Cursor::new(include_str!("genotype.csv"))).finish()
1471}
1472
1473/// # Old Faithful Geyser Data
1474///
/// ## Description:
1476///
1477/// A version of the eruptions data from the ‘Old Faithful’ geyser in
1478/// Yellowstone National Park, Wyoming. This version comes from
1479/// Azzalini and Bowman (1990) and is of continuous measurement from
1480/// August 1 to August 15, 1985.
1481///
1482/// Some nocturnal duration measurements were coded as 2, 3 or 4
1483/// minutes, having originally been described as ‘short’, ‘medium’ or
1484/// ‘long’.
1485///
/// ## Usage:
1487///
1488/// geyser
1489///
/// ## Format:
1491///
1492/// A data frame with 299 observations on 2 variables.
1493///
1494/// * ‘duration’ numeric Eruption time in mins
/// * ‘waiting’ numeric Waiting time for this eruption
1496///
1497/// ## Note:
1498///
1499/// The ‘waiting’ time was incorrectly described as the time to the
1500/// next eruption in the original files, and corrected for ‘MASS’
1501/// version 7.3-30.
1502///
1503/// ## References:
1504///
1505/// Azzalini, A. and Bowman, A. W. (1990) A look at some data on the
1506/// Old Faithful geyser. _Applied Statistics_ *39*, 357-365.
1507///
1508/// Venables, W. N. and Ripley, B. D. (2002) _Modern Applied
1509/// Statistics with S._ Fourth edition. Springer.
1510///
1511/// ## See Also:
1512///
1513/// ‘faithful’.
1514///
1515/// CRAN package ‘sm’.
1516pub fn geyser() -> PolarsResult<DataFrame> {
1517 CsvReader::new(Cursor::new(include_str!("geyser.csv"))).finish()
1518}
1519
1520/// # Line Transect of Soil in Gilgai Territory
1521///
1522/// ## Description:
1523///
1524/// This dataset was collected on a line transect survey in gilgai
1525/// territory in New South Wales, Australia. Gilgais are natural
1526/// gentle depressions in otherwise flat land, and sometimes seem to
1527/// be regularly distributed. The data collection was stimulated by
1528/// the question: are these patterns reflected in soil properties? At
1529/// each of 365 sampling locations on a linear grid of 4 meters
1530/// spacing, samples were taken at depths 0-10 cm, 30-40 cm and 80-90
1531/// cm below the surface. pH, electrical conductivity and chloride
1532/// content were measured on a 1:5 soil:water extract from each
1533/// sample.
1534///
1535/// ## Usage:
1536///
1537/// gilgais
1538///
1539/// ## Format:
1540///
1541/// This data frame contains the following columns:
1542///
1543/// * ‘pH00’ pH at depth 0-10 cm.
1544/// * ‘pH30’ pH at depth 30-40 cm.
1545/// * ‘pH80’ pH at depth 80-90 cm.
1546/// * ‘e00’ electrical conductivity in mS/cm (0-10 cm).
1547/// * ‘e30’ electrical conductivity in mS/cm (30-40 cm).
1548/// * ‘e80’ electrical conductivity in mS/cm (80-90 cm).
1549/// * ‘c00’ chloride content in ppm (0-10 cm).
1550/// * ‘c30’ chloride content in ppm (30-40 cm).
1551/// * ‘c80’ chloride content in ppm (80-90 cm).
1552///
1553/// ## Source:
1554///
1555/// Webster, R. (1977) Spectral analysis of gilgai soil. _Australian
1556/// Journal of Soil Research_ *15*, 191-204.
1557///
1558/// Laslett, G. M. (1989) Kriging and splines: An empirical comparison
1559/// of their predictive performance in some applications (with
1560/// discussion). _Journal of the American Statistical Association_
1561/// *89*, 319-409
1562///
1563/// ## References:
1564///
1565/// Venables, W. N. and Ripley, B. D. (2002) _Modern Applied
1566/// Statistics with S._ Fourth edition. Springer.
1567pub fn gilgais() -> PolarsResult<DataFrame> {
1568 CsvReader::new(Cursor::new(include_str!("gilgais.csv"))).finish()
1569}
1570
1571/// # Record Times in Scottish Hill Races
1572///
1573/// ## Description:
1574///
1575/// The record times in 1984 for 35 Scottish hill races.
1576///
1577/// ## Usage:
1578///
1579/// hills
1580///
1581/// ## Format:
1582///
1583/// The components are:
1584///
1585/// * ‘dist’ distance in miles (on the map).
1586/// * ‘climb’ total height gained during the route, in feet.
1587/// * ‘time’ record time in minutes.
1588///
1589/// ## Source:
1590///
1591/// A.C. Atkinson (1986) Comment: Aspects of diagnostic regression
1592/// analysis. _Statistical Science_ *1*, 397-402.
1593///
1594/// [A.C. Atkinson (1988) Transformations unmasked. _Technometrics_
1595/// *30*, 311-318 “corrects” the time for Knock Hill from 78.65 to
/// 18.65. It is unclear if this is based on the original records.]
1597///
1598/// ## References:
1599///
1600/// Venables, W. N. and Ripley, B. D. (2002) _Modern Applied
1601/// Statistics with S._ Fourth edition. Springer.
1602pub fn hills() -> PolarsResult<DataFrame> {
1603 CsvReader::new(Cursor::new(include_str!("hills.csv"))).finish()
1604}
1605
1606/// # Frequency Table from a Copenhagen Housing Conditions Survey
1607///
1608/// ## Description:
1609///
1610/// The ‘housing’ data frame has 72 rows and 5 variables.
1611///
1612/// ## Usage:
1613///
1614/// housing
1615///
1616/// ## Format:
1617///
1618/// * ‘Sat’ Satisfaction of householders with their present housing
1619/// circumstances, (High, Medium or Low, ordered factor).
1620/// * ‘Infl’ Perceived degree of influence householders have on the
1621/// management of the property (High, Medium, Low).
1622/// * ‘Type’ Type of rental accommodation, (Tower, Atrium, Apartment,
1623/// Terrace).
1624/// * ‘Cont’ Contact residents are afforded with other residents, (Low,
1625/// High).
1626/// * ‘Freq’ Frequencies: the numbers of residents in each class.
1627///
1628/// ## Source:
1629///
1630/// Madsen, M. (1976) Statistical analysis of multiple contingency
1631/// tables. Two examples. _Scand. J. Statist._ *3*, 97-106.
1632///
1633/// Cox, D. R. and Snell, E. J. (1984) _Applied Statistics, Principles
1634/// and Examples_. Chapman & Hall.
1635///
1636/// ## References:
1637///
1638/// Venables, W. N. and Ripley, B. D. (2002) _Modern Applied
1639/// Statistics with S._ Fourth edition. Springer.
1640///
1641/// ## Examples:
1642///
1643/// ```r
1644/// options(contrasts = c("contr.treatment", "contr.poly"))
1645///
1646/// # Surrogate Poisson models
1647/// house.glm0 <- glm(Freq ~ Infl*Type*Cont + Sat, family = poisson,
1648/// data = housing)
1649/// ## IGNORE_RDIFF_BEGIN
1650/// summary(house.glm0, correlation = FALSE)
1651/// ## IGNORE_RDIFF_END
1652///
1653/// addterm(house.glm0, ~. + Sat:(Infl+Type+Cont), test = "Chisq")
1654///
1655/// house.glm1 <- update(house.glm0, . ~ . + Sat*(Infl+Type+Cont))
1656/// ## IGNORE_RDIFF_BEGIN
1657/// summary(house.glm1, correlation = FALSE)
1658/// ## IGNORE_RDIFF_END
1659///
1660/// 1 - pchisq(deviance(house.glm1), house.glm1$df.residual)
1661///
1662/// dropterm(house.glm1, test = "Chisq")
1663///
1664/// addterm(house.glm1, ~. + Sat:(Infl+Type+Cont)^2, test = "Chisq")
1665///
1666/// hnames <- lapply(housing[, -5], levels) # omit Freq
1667/// newData <- expand.grid(hnames)
1668/// newData$Sat <- ordered(newData$Sat)
1669/// house.pm <- predict(house.glm1, newData,
1670/// type = "response") # poisson means
1671/// house.pm <- matrix(house.pm, ncol = 3, byrow = TRUE,
1672/// dimnames = list(NULL, hnames[[1]]))
1673/// house.pr <- house.pm/drop(house.pm %*% rep(1, 3))
1674/// cbind(expand.grid(hnames[-1]), round(house.pr, 2))
1675///
1676/// # Iterative proportional scaling
1677/// loglm(Freq ~ Infl*Type*Cont + Sat*(Infl+Type+Cont), data = housing)
1678///
1679///
1680/// # multinomial model
1681/// library(nnet)
1682/// (house.mult<- multinom(Sat ~ Infl + Type + Cont, weights = Freq,
1683/// data = housing))
1684/// house.mult2 <- multinom(Sat ~ Infl*Type*Cont, weights = Freq,
1685/// data = housing)
1686/// anova(house.mult, house.mult2)
1687///
1688/// house.pm <- predict(house.mult, expand.grid(hnames[-1]), type = "probs")
1689/// cbind(expand.grid(hnames[-1]), round(house.pm, 2))
1690///
1691/// # proportional odds model
1692/// house.cpr <- apply(house.pr, 1, cumsum)
1693/// logit <- function(x) log(x/(1-x))
1694/// house.ld <- logit(house.cpr[2, ]) - logit(house.cpr[1, ])
1695/// (ratio <- sort(drop(house.ld)))
1696/// mean(ratio)
1697///
1698/// (house.plr <- polr(Sat ~ Infl + Type + Cont,
1699/// data = housing, weights = Freq))
1700///
1701/// house.pr1 <- predict(house.plr, expand.grid(hnames[-1]), type = "probs")
1702/// cbind(expand.grid(hnames[-1]), round(house.pr1, 2))
1703///
1704/// Fr <- matrix(housing$Freq, ncol = 3, byrow = TRUE)
1705/// 2*sum(Fr*log(house.pr/house.pr1))
1706///
1707/// house.plr2 <- stepAIC(house.plr, ~.^2)
1708/// house.plr2$anova
1709/// ```
1710pub fn housing() -> PolarsResult<DataFrame> {
1711 CsvReader::new(Cursor::new(include_str!("housing.csv"))).finish()
1712}
1713
1714/// # Yields from a Barley Field Trial
1715///
1716/// ## Description:
1717///
1718/// The ‘immer’ data frame has 30 rows and 4 columns. Five varieties
1719/// of barley were grown in six locations in each of 1931 and 1932.
1720///
1721/// ## Usage:
1722///
1723/// immer
1724///
1725/// ## Format:
1726///
1727/// This data frame contains the following columns:
1728///
1729/// * ‘Loc’ The location.
1730/// * ‘Var’ The variety of barley (‘"manchuria"’, ‘"svansota"’,
1731/// ‘"velvet"’, ‘"trebi"’ and ‘"peatland"’).
1732/// * ‘Y1’ Yield in 1931.
1733/// * ‘Y2’ Yield in 1932.
1734///
1735/// ## Source:
1736///
1737/// Immer, F.R., Hayes, H.D. and LeRoy Powers (1934) Statistical
1738/// determination of barley varietal adaptation. _Journal of the
1739/// American Society for Agronomy_ *26*, 403-419.
1740///
1741/// Fisher, R.A. (1947) _The Design of Experiments._ 4th edition.
1742/// Edinburgh: Oliver and Boyd.
1743///
1744/// ## References:
1745///
1746/// Venables, W. N. and Ripley, B. D. (2002) _Modern Applied
1747/// Statistics with S-PLUS._ Fourth Edition. Springer.
1748///
1749/// ## Examples:
1750///
1751/// ```r
1752/// immer.aov <- aov(cbind(Y1,Y2) ~ Loc + Var, data = immer)
1753/// summary(immer.aov)
1754///
1755/// immer.aov <- aov((Y1+Y2)/2 ~ Var + Loc, data = immer)
1756/// summary(immer.aov)
1757/// model.tables(immer.aov, type = "means", se = TRUE, cterms = "Var")
1758/// ```
1759pub fn immer() -> PolarsResult<DataFrame> {
1760 CsvReader::new(Cursor::new(include_str!("immer.csv"))).finish()
1761}
1762
1763/// # Numbers of Car Insurance claims
1764///
1765/// ## Description:
1766///
1767/// The data given in data frame ‘Insurance’ consist of the numbers of
1768/// policyholders of an insurance company who were exposed to risk,
1769/// and the numbers of car insurance claims made by those
1770/// policyholders in the third quarter of 1973.
1771///
1772/// ## Usage:
1773///
1774/// Insurance
1775///
1776/// ## Format:
1777///
1778/// This data frame contains the following columns:
1779///
1780/// * ‘District’ factor: district of residence of policyholder (1 to 4):
1781/// 4 is major cities.
1782/// * ‘Group’ an ordered factor: group of car with levels <1 litre,
1783/// 1-1.5 litre, 1.5-2 litre, >2 litre.
1784/// * ‘Age’ an ordered factor: the age of the insured in 4 groups
1785/// labelled <25, 25-29, 30-35, >35.
1786/// * ‘Holders’ numbers of policyholders.
1787/// * ‘Claims’ numbers of claims
1788///
1789/// ## Source:
1790///
1791/// L. A. Baxter, S. M. Coutts and G. A. F. Ross (1980) Applications
1792/// of linear models in motor insurance. _Proceedings of the 21st
1793/// International Congress of Actuaries, Zurich_ pp. 11-29.
1794///
1795/// M. Aitkin, D. Anderson, B. Francis and J. Hinde (1989)
1796/// _Statistical Modelling in GLIM._ Oxford University Press.
1797///
1798/// ## References:
1799///
1800/// Venables, W. N. and Ripley, B. D. (2002) _Modern Applied
1801/// Statistics with S-PLUS._ Fourth Edition. Springer.
1802///
1803/// ## Examples:
1804///
1805/// ```r
1806/// ## main-effects fit as Poisson GLM with offset
1807/// glm(Claims ~ District + Group + Age + offset(log(Holders)),
1808/// data = Insurance, family = poisson)
1809///
1810/// # same via loglm
1811/// loglm(Claims ~ District + Group + Age + offset(log(Holders)),
1812/// data = Insurance)
1813/// ```
1814pub fn insurance() -> PolarsResult<DataFrame> {
1815 CsvReader::new(Cursor::new(include_str!("Insurance.csv"))).finish()
1816}
1817
1818/// # Survival Times and White Blood Counts for Leukaemia Patients
1819///
1820/// ## Description:
1821///
1822/// A data frame of data from 33 leukaemia patients.
1823///
1824/// ## Usage:
1825///
1826/// leuk
1827///
1828/// ## Format:
1829///
1830/// A data frame with columns:
1831///
1832/// * ‘wbc’ white blood count.
1833/// * ‘ag’ a test result, ‘"present"’ or ‘"absent"’.
1834/// * ‘time’ survival time in weeks.
1835///
1836/// ## Details:
1837///
1838/// Survival times are given for 33 patients who died from acute
1839/// myelogenous leukaemia. Also measured was the patient's white
1840/// blood cell count at the time of diagnosis. The patients were also
1841/// factored into 2 groups according to the presence or absence of a
1842/// morphologic characteristic of white blood cells. Patients termed
1843/// AG positive were identified by the presence of Auer rods and/or
1844/// significant granulation of the leukaemic cells in the bone marrow
1845/// at the time of diagnosis.
1846///
1847/// ## Source:
1848///
1849/// Cox, D. R. and Oakes, D. (1984) _Analysis of Survival Data_.
1850/// Chapman & Hall, p. 9.
1851///
1852/// ## Taken from:
1853///
1854/// Feigl, P. & Zelen, M. (1965) Estimation of exponential survival
1855/// probabilities with concomitant information. _Biometrics_ *21*,
1856/// 826-838.
1857///
1858/// ## References:
1859///
1860/// Venables, W. N. and Ripley, B. D. (2002) _Modern Applied
1861/// Statistics with S._ Fourth edition. Springer.
1862///
1863/// ## Examples:
1864///
1865/// ```r
1866/// library(survival)
1867/// plot(survfit(Surv(time) ~ ag, data = leuk), lty = 2:3, col = 2:3)
1868///
1869/// # now Cox models
1870/// leuk.cox <- coxph(Surv(time) ~ ag + log(wbc), leuk)
1871/// summary(leuk.cox)
1872/// ```
1873pub fn leuk() -> PolarsResult<DataFrame> {
1874 CsvReader::new(Cursor::new(include_str!("leuk.csv"))).finish()
1875}
1876
1877/// # Brain and Body Weights for 62 Species of Land Mammals
1878///
1879/// ## Description:
1880///
1881/// A data frame with average brain and body weights for 62 species of
1882/// land mammals.
1883///
1884/// ## Usage:
1885///
1886/// mammals
1887///
1888/// ## Format:
1889///
1890/// * ‘body’ body weight in kg.
1891/// * ‘brain’ brain weight in g.
1892/// * ‘name’ Common name of species. (Rock hyrax-a = _Heterohyrax
1893/// brucci_, Rock hyrax-b = _Procavia habessinic._.)
1894///
1895/// ## Source:
1896///
1897/// Weisberg, S. (1985) _Applied Linear Regression._ 2nd edition.
1898/// Wiley, pp. 144-5.
1899///
1900/// Selected from: Allison, T. and Cicchetti, D. V. (1976) Sleep in
1901/// mammals: ecological and constitutional correlates. _Science_
1902/// *194*, 732-734.
1903///
1904/// ## References:
1905///
1906/// Venables, W. N. and Ripley, B. D. (2002) _Modern Applied
1907/// Statistics with S-PLUS._ Fourth Edition. Springer.
1908pub fn mammals() -> PolarsResult<DataFrame> {
1909 CsvReader::new(Cursor::new(include_str!("mammals.csv"))).finish()
1910}
1911
1912/// # Data from a Simulated Motorcycle Accident
1913///
1914/// ## Description:
1915///
1916/// A data frame giving a series of measurements of head acceleration
1917/// in a simulated motorcycle accident, used to test crash helmets.
1918///
1919/// ## Usage:
1920///
1921/// mcycle
1922///
1923/// ## Format:
1924///
1925/// * ‘times’ in milliseconds after impact.
1926/// * ‘accel’ in g.
1927///
1928/// ## Source:
1929///
1930/// Silverman, B. W. (1985) Some aspects of the spline smoothing
1931/// approach to non-parametric curve fitting. _Journal of the Royal
1932/// Statistical Society series B_ *47*, 1-52.
1933///
1934/// ## References:
1935///
1936/// Venables, W. N. and Ripley, B. D. (2002) _Modern Applied
1937/// Statistics with S-PLUS._ Fourth Edition. Springer.
1938pub fn mcycle() -> PolarsResult<DataFrame> {
1939 CsvReader::new(Cursor::new(include_str!("mcycle.csv"))).finish()
1940}
1941
1942/// # Survival from Malignant Melanoma
1943///
1944/// ## Description:
1945///
1946/// The ‘Melanoma’ data frame has data on 205 patients in Denmark with
1947/// malignant melanoma.
1948///
1949/// ## Usage:
1950///
1951/// Melanoma
1952///
1953/// ## Format:
1954///
1955/// This data frame contains the following columns:
1956///
1957/// * ‘time’ survival time in days, possibly censored.
1958/// * ‘status’ ‘1’ died from melanoma, ‘2’ alive, ‘3’ dead from other
1959/// causes.
1960/// * ‘sex’ ‘1’ = male, ‘0’ = female.
1961/// * ‘age’ age in years.
1962/// * ‘year’ of operation.
1963/// * ‘thickness’ tumour thickness in mm.
1964/// * ‘ulcer’ ‘1’ = presence, ‘0’ = absence.
1965///
1966/// ## Source:
1967///
1968/// P. K. Andersen, O. Borgan, R. D. Gill and N. Keiding (1993)
1969/// _Statistical Models based on Counting Processes._ Springer.
1970pub fn melanoma() -> PolarsResult<DataFrame> {
1971 CsvReader::new(Cursor::new(include_str!("Melanoma.csv"))).finish()
1972}
1973
1974/// # Age of Menarche in Warsaw
1975///
1976/// ## Description:
1977///
1978/// Proportions of female children at various ages during adolescence
1979/// who have reached menarche.
1980///
1981/// ## Usage:
1982///
1983/// menarche
1984///
1985/// ## Format:
1986///
1987/// This data frame contains the following columns:
1988///
1989/// * ‘Age’ Average age of the group. (The groups are reasonably age
1990/// homogeneous.)
1991/// * ‘Total’ Total number of children in the group.
1992/// * ‘Menarche’ Number who have reached menarche.
1993///
1994/// ## Source:
1995///
1996/// Milicer, H. and Szczotka, F. (1966) Age at Menarche in Warsaw
1997/// girls in 1965. _Human Biology_ *38*, 199-203.
1998///
1999/// The data are also given in
2000/// Aranda-Ordaz, F.J. (1981) On two families of transformations to
2001/// additivity for binary response data. _Biometrika_ *68*, 357-363.
2002///
2003/// ## References:
2004///
2005/// Venables, W. N. and Ripley, B. D. (2002) _Modern Applied
2006/// Statistics with S._ Fourth edition. Springer.
2007///
2008/// ## Examples:
2009///
2010/// ```r
2011/// mprob <- glm(cbind(Menarche, Total - Menarche) ~ Age,
2012/// binomial(link = probit), data = menarche)
2013/// ```
2014pub fn menarche() -> PolarsResult<DataFrame> {
2015 CsvReader::new(Cursor::new(include_str!("menarche.csv"))).finish()
2016}
2017
2018/// # Michelson's Speed of Light Data
2019///
2020/// ## Description:
2021///
2022/// Measurements of the speed of light in air, made between 5th June
2023/// and 2nd July, 1879. The data consists of five experiments, each
2024/// consisting of 20 consecutive runs. The response is the speed of
2025/// light in km/s, less 299000. The currently accepted value, on this
2026/// scale of measurement, is 734.5.
2027///
2028/// ## Usage:
2029///
2030/// michelson
2031///
2032/// ## Format:
2033///
2034/// The data frame contains the following components:
2035///
2036/// * ‘Expt’ The experiment number, from 1 to 5.
2037/// * ‘Run’ The run number within each experiment.
2038/// * ‘Speed’ Speed-of-light measurement.
2039///
2040/// ## Source:
2041///
2042/// A.J. Weekes (1986) _A Genstat Primer._ Edward Arnold.
2043///
2044/// S. M. Stigler (1977) Do robust estimators work with real data?
2045/// _Annals of Statistics_ *5*, 1055-1098.
2046///
2047/// ## References:
2048///
2049/// Venables, W. N. and Ripley, B. D. (2002) _Modern Applied
2050/// Statistics with S._ Fourth edition. Springer.
2051pub fn michelson() -> PolarsResult<DataFrame> {
2052 CsvReader::new(Cursor::new(include_str!("michelson.csv"))).finish()
2053}
2054
2055/// # Minnesota High School Graduates of 1938
2056///
2057/// ## Description:
2058///
2059/// The Minnesota high school graduates of 1938 were classified
2060/// according to four factors, described below. The ‘minn38’ data
2061/// frame has 168 rows and 5 columns.
2062///
2063/// ## Usage:
2064///
2065/// minn38
2066///
2067/// ## Format:
2068///
2069/// This data frame contains the following columns:
2070///
2071/// * ‘hs’ high school rank: ‘"L"’, ‘"M"’ and ‘"U"’ for lower, middle
2072/// and upper third.
2073/// * ‘phs’ post high school status: Enrolled in college, (‘"C"’),
2074/// enrolled in non-collegiate school, (‘"N"’), employed
2075/// full-time, (‘"E"’) and other, (‘"O"’).
2076/// * ‘fol’ father's occupational level, (seven levels, ‘"F1"’, ‘"F2"’,
2077/// ..., ‘"F7"’).
/// * ‘sex’ sex: factor with levels ‘"F"’ or ‘"M"’.
2079/// * ‘f’ frequency.
2080///
2081/// ## Source:
2082///
2083/// From R. L. Plackett, (1974) _The Analysis of Categorical Data._
2084/// London: Griffin
2085///
2086/// who quotes the data from
2087///
2088/// Hoyt, C. J., Krishnaiah, P. R. and Torrance, E. P. (1959) Analysis
2089/// of complex contingency tables, _J. Exp. Ed._ *27*, 187-194.
2090pub fn minn38() -> PolarsResult<DataFrame> {
2091 CsvReader::new(Cursor::new(include_str!("minn38.csv"))).finish()
2092}
2093
2094/// # Accelerated Life Testing of Motorettes
2095///
2096/// ## Description:
2097///
2098/// The ‘motors’ data frame has 40 rows and 3 columns. It describes
2099/// an accelerated life test at each of four temperatures of 10
2100/// motorettes, and has rather discrete times.
2101///
2102/// ## Usage:
2103///
2104/// motors
2105///
2106/// ## Format:
2107///
2108/// This data frame contains the following columns:
2109///
2110/// * ‘temp’ the temperature (degrees C) of the test.
2111/// * ‘time’ the time in hours to failure or censoring at 8064 hours (=
2112/// 336 days).
2113/// * ‘cens’ an indicator variable for death.
2114///
2115/// ## Source:
2116///
2117/// Kalbfleisch, J. D. and Prentice, R. L. (1980) _The Statistical
2118/// Analysis of Failure Time Data._ New York: Wiley.
2119///
2120/// ## Taken from:
2121///
2122/// Nelson, W. D. and Hahn, G. J. (1972) Linear regression of a
2123/// regression relationship from censored data. Part 1 - simple
2124/// methods and their application. _Technometrics_, *14*, 247-276.
2125///
2126/// ## References:
2127///
2128/// Venables, W. N. and Ripley, B. D. (2002) _Modern Applied
2129/// Statistics with S._ Fourth edition. Springer.
2130///
2131/// ## Examples:
2132///
2133/// ```r
2134/// library(survival)
2135/// plot(survfit(Surv(time, cens) ~ factor(temp), motors), conf.int = FALSE)
2136/// # fit Weibull model
2137/// motor.wei <- survreg(Surv(time, cens) ~ temp, motors)
2138/// summary(motor.wei)
2139/// # and predict at 130C
2140/// unlist(predict(motor.wei, data.frame(temp=130), se.fit = TRUE))
2141///
2142/// motor.cox <- coxph(Surv(time, cens) ~ temp, motors)
2143/// summary(motor.cox)
2144/// # predict at temperature 200
2145/// plot(survfit(motor.cox, newdata = data.frame(temp=200),
2146/// conf.type = "log-log"))
2147/// summary( survfit(motor.cox, newdata = data.frame(temp=130)) )
2148/// ```
2149pub fn motors() -> PolarsResult<DataFrame> {
2150 CsvReader::new(Cursor::new(include_str!("motors.csv"))).finish()
2151}
2152
2153/// # Effect of Calcium Chloride on Muscle Contraction in Rat Hearts
2154///
2155/// ## Description:
2156///
2157/// The purpose of this experiment was to assess the influence of
2158/// calcium in solution on the contraction of heart muscle in rats.
2159/// The left auricle of 21 rat hearts was isolated and on several
2160/// occasions a constant-length strip of tissue was electrically
2161/// stimulated and dipped into various concentrations of calcium
2162/// chloride solution, after which the shortening of the strip was
2163/// accurately measured as the response.
2164///
2165/// ## Usage:
2166///
2167/// muscle
2168///
2169/// ## Format:
2170///
2171/// This data frame contains the following columns:
2172///
2173/// * ‘Strip’ which heart muscle strip was used?
2174/// * ‘Conc’ concentration of calcium chloride solution, in multiples of
2175/// 2.2 mM.
2176/// * ‘Length’ the change in length (shortening) of the strip,
2177/// (allegedly) in mm.
2178///
2179/// ## Source:
2180///
2181/// Linder, A., Chakravarti, I. M. and Vuagnat, P. (1964) Fitting
2182/// asymptotic regression curves with different asymptotes. In
2183/// _Contributions to Statistics. Presented to Professor P. C.
2184/// Mahalanobis on the occasion of his 70th birthday_, ed. C. R. Rao,
2185/// pp. 221-228. Oxford: Pergamon Press.
2186///
2187/// ## References:
2188///
2189/// Venables, W. N. and Ripley, B. D. (2002) _Modern Applied
2190/// Statistics with S._ Fourth Edition. Springer.
2191///
2192/// ## Examples:
2193///
2194/// ```r
2195/// ## IGNORE_RDIFF_BEGIN
2196/// A <- model.matrix(~ Strip - 1, data=muscle)
2197/// rats.nls1 <- nls(log(Length) ~ cbind(A, rho^Conc),
2198/// data = muscle, start = c(rho=0.1), algorithm="plinear")
2199/// (B <- coef(rats.nls1))
2200///
2201/// st <- list(alpha = B[2:22], beta = B[23], rho = B[1])
2202/// (rats.nls2 <- nls(log(Length) ~ alpha[Strip] + beta*rho^Conc,
2203/// data = muscle, start = st))
2204/// ## IGNORE_RDIFF_END
2205///
2206/// Muscle <- with(muscle, {
2207/// Muscle <- expand.grid(Conc = sort(unique(Conc)), Strip = levels(Strip))
2208/// Muscle$Yhat <- predict(rats.nls2, Muscle)
2209/// Muscle <- cbind(Muscle, logLength = rep(as.numeric(NA), 126))
2210/// ind <- match(paste(Strip, Conc),
2211/// paste(Muscle$Strip, Muscle$Conc))
2212/// Muscle$logLength[ind] <- log(Length)
2213/// Muscle})
2214///
2215/// lattice::xyplot(Yhat ~ Conc | Strip, Muscle, as.table = TRUE,
2216/// ylim = range(c(Muscle$Yhat, Muscle$logLength), na.rm = TRUE),
2217/// subscripts = TRUE, xlab = "Calcium Chloride concentration (mM)",
2218/// ylab = "log(Length in mm)", panel =
2219/// function(x, y, subscripts, ...) {
2220/// panel.xyplot(x, Muscle$logLength[subscripts], ...)
2221/// llines(spline(x, y))
2222/// })
2223/// ```
2224pub fn muscle() -> PolarsResult<DataFrame> {
2225 CsvReader::new(Cursor::new(include_str!("muscle.csv"))).finish()
2226}
2227
2228/// # Newcomb's Measurements of the Passage Time of Light
2229///
2230/// ## Description:
2231///
2232/// A numeric vector giving the ‘Third Series’ of measurements of the
2233/// passage time of light recorded by Newcomb in 1882. The given
2234/// values divided by 1000 plus 24.8 give the time in millionths of a
2235/// second for light to traverse a known distance. The ‘true’ value is
2236/// now considered to be 33.02.
2237///
2238/// The dataset is given in the order in Staudte and Sheather.
2239/// Stigler (1977, Table 5) gives the dataset as
2240///
/// ```text
/// 28 26 33 24 34 -44 27 16 40 -2 29 22 24 21 25 30 23 29 31 19
/// 24 20 36 32 36 28 25 21 28 29 37 25 28 26 30 32 36 26 30 22
/// 36 23 27 27 28 27 31 27 26 33 26 32 32 24 39 28 24 25 32 25
/// 29 27 28 29 16 23
/// ```
2245///
2246/// However, order is not relevant to its use as an example of robust
2247/// estimation. (Thanks to Anthony Unwin for bringing this difference
2248/// to our attention.)
2249///
2250/// ## Usage:
2251///
2252/// newcomb
2253///
2254/// ## Source:
2255///
2256/// S. M. Stigler (1973) Simon Newcomb, Percy Daniell, and the history
2257/// of robust estimation 1885-1920. _Journal of the American
2258/// Statistical Association_ *68*, 872-879.
2259///
2260/// S. M. Stigler (1977) Do robust estimators work with _real_ data?
2261/// _Annals of Statistics_, *5*, 1055-1098.
2262///
2263/// R. G. Staudte and S. J. Sheather (1990) _Robust Estimation and
2264/// Testing._ Wiley.
2265pub fn newcomb() -> PolarsResult<DataFrame> {
2266 CsvReader::new(Cursor::new(include_str!("newcomb.csv"))).finish()
2267}
2268
2269/// # Eighth-Grade Pupils in the Netherlands
2270///
2271/// ## Description:
2272///
2273/// Snijders and Bosker (1999) use as a running example a study of
2274/// 2287 eighth-grade pupils (aged about 11) in 132 classes in 131
2275/// schools in the Netherlands. Only the variables used in our
2276/// examples are supplied.
2277///
2278/// ## Usage:
2279///
2280/// nlschools
2281///
2282/// ## Format:
2283///
2284/// This data frame contains 2287 rows and the following columns:
2285///
2286/// * ‘lang’ language test score.
2287/// * ‘IQ’ verbal IQ.
2288/// * ‘class’ class ID.
2289/// * ‘GS’ class size: number of eighth-grade pupils recorded in the
2290/// class (there may be others: see ‘COMB’, and some may have
2291/// been omitted with missing values).
2292/// * ‘SES’ social-economic status of pupil's family.
2293/// * ‘COMB’ were the pupils taught in a multi-grade class (‘0/1’)?
2294/// Classes which contained pupils from grades 7 and 8 are coded
2295/// ‘1’, but only eighth-graders were tested.
2296///
2297/// ## Source:
2298///
2299/// Snijders, T. A. B. and Bosker, R. J. (1999) _Multilevel Analysis.
2300/// An Introduction to Basic and Advanced Multilevel Modelling._
2301/// London: Sage.
2302///
2303/// ## References:
2304///
2305/// Venables, W. N. and Ripley, B. D. (2002) _Modern Applied
2306/// Statistics with S._ Fourth edition. Springer.
2307///
2308/// ## Examples:
2309///
2310/// ```r
2311/// nl1 <- within(nlschools, {
2312/// IQave <- tapply(IQ, class, mean)[as.character(class)]
2313/// IQ <- IQ - IQave
2314/// })
2315/// cen <- c("IQ", "IQave", "SES")
2316/// nl1[cen] <- scale(nl1[cen], center = TRUE, scale = FALSE)
2317///
2318/// nl.lme <- nlme::lme(lang ~ IQ*COMB + IQave + SES,
2319/// random = ~ IQ | class, data = nl1)
2320/// ## IGNORE_RDIFF_BEGIN
2321/// summary(nl.lme)
2322/// ## IGNORE_RDIFF_END
2323/// ```
2324pub fn nlschools() -> PolarsResult<DataFrame> {
2325 CsvReader::new(Cursor::new(include_str!("nlschools.csv"))).finish()
2326}
2327
2328// /// # Classical N, P, K Factorial Experiment
2329// ///
2330// /// ## Description:
2331// ///
2332// /// A classical N, P, K (nitrogen, phosphate, potassium) factorial
2333// /// experiment on the growth of peas conducted on 6 blocks. Each half
2334// /// of a fractional factorial design confounding the NPK interaction
2335// /// was used on 3 of the plots.
2336// ///
2337// /// ## Usage:
2338// ///
2339// /// npk
2340// ///
2341// /// ## Format:
2342// ///
2343// /// The ‘npk’ data frame has 24 rows and 5 columns:
2344// ///
2345// /// * ‘block’ which block (label 1 to 6).
2346// /// * ‘N’ indicator (0/1) for the application of nitrogen.
2347// /// * ‘P’ indicator (0/1) for the application of phosphate.
2348// /// * ‘K’ indicator (0/1) for the application of potassium.
2349// /// * ‘yield’ Yield of peas, in pounds/plot (the plots were (1/70)
2350// /// acre).
2351// ///
2352// /// ## Note:
2353// ///
2354// /// This dataset is also contained in R 3.0.2 and later.
2355// ///
2356// /// ## Source:
2357// ///
2358// /// Imperial College, London, M.Sc. exercise sheet.
2359// ///
2360// /// ## References:
2361// ///
2362// /// Venables, W. N. and Ripley, B. D. (2002) _Modern Applied
2363// /// Statistics with S._ Fourth edition. Springer.
2364// ///
2365// /// ## Examples:
2366// ///
2367// /// ```r
2368// /// options(contrasts = c("contr.sum", "contr.poly"))
2369// /// npk.aov <- aov(yield ~ block + N*P*K, npk)
2370// /// ## IGNORE_RDIFF_BEGIN
2371// /// npk.aov
2372// /// summary(npk.aov)
2373// /// alias(npk.aov)
2374// /// coef(npk.aov)
2375// /// options(contrasts = c("contr.treatment", "contr.poly"))
2376// /// npk.aov1 <- aov(yield ~ block + N + K, data = npk)
2377// /// summary.lm(npk.aov1)
2378// /// se.contrast(npk.aov1, list(N=="0", N=="1"), data = npk)
2379// /// model.tables(npk.aov1, type = "means", se = TRUE)
2380// /// ## IGNORE_RDIFF_END
2381// /// ```
2382// pub fn npk() -> PolarsResult<DataFrame> {
2383// CsvReader::new(Cursor::new(include_str!("npk.csv"))).finish()
2384// }
2385
2386/// # US Naval Petroleum Reserve No. 1 data
2387///
2388/// ## Description:
2389///
2390/// Data on the locations, porosity and permeability (a measure of oil
2391/// flow) on 104 oil wells in the US Naval Petroleum Reserve No. 1 in
2392/// California.
2393///
2394/// ## Usage:
2395///
2396/// npr1
2397///
2398/// ## Format:
2399///
2400/// This data frame contains the following columns:
2401///
/// * ‘x’ x coordinates, in miles (origin unspecified).
2403/// * ‘y’ y coordinates, in miles.
2404/// * ‘perm’ permeability in milli-Darcies.
2405/// * ‘por’ porosity (%).
2406///
2407/// ## Source:
2408///
2409/// Maher, J.C., Carter, R.D. and Lantz, R.J. (1975) Petroleum geology
2410/// of Naval Petroleum Reserve No. 1, Elk Hills, Kern County,
2411/// California. _USGS Professional Paper_ *912*.
2412///
2413/// ## References:
2414///
2415/// Venables, W. N. and Ripley, B. D. (2002) _Modern Applied
2416/// Statistics with S._ Fourth edition. Springer.
2417pub fn npr1() -> PolarsResult<DataFrame> {
2418 CsvReader::new(Cursor::new(include_str!("npr1.csv"))).finish()
2419}
2420
2421/// # Data from an Oats Field Trial
2422///
2423/// ## Description:
2424///
2425/// The yield of oats from a split-plot field trial using three
2426/// varieties and four levels of manurial treatment. The experiment
2427/// was laid out in 6 blocks of 3 main plots, each split into 4
2428/// sub-plots. The varieties were applied to the main plots and the
2429/// manurial treatments to the sub-plots.
2430///
2431/// ## Usage:
2432///
2433/// oats
2434///
2435/// ## Format:
2436///
2437/// This data frame contains the following columns:
2438///
2439/// * ‘B’ Blocks, levels I, II, III, IV, V and VI.
2440/// * ‘V’ Varieties, 3 levels.
2441/// * ‘N’ Nitrogen (manurial) treatment, levels 0.0cwt, 0.2cwt, 0.4cwt
2442/// and 0.6cwt, showing the application in cwt/acre.
2443/// * ‘Y’ Yields in 1/4lbs per sub-plot, each of area 1/80 acre.
2444///
2445/// ## Source:
2446///
2447/// Yates, F. (1935) Complex experiments, _Journal of the Royal
2448/// Statistical Society Suppl._ *2*, 181-247.
2449///
2450/// Also given in Yates, F. (1970) _Experimental design: Selected
2451/// papers of Frank Yates, C.B.E, F.R.S._ London: Griffin.
2452///
2453/// ## References:
2454///
2455/// Venables, W. N. and Ripley, B. D. (2002) _Modern Applied
2456/// Statistics with S._ Fourth edition. Springer.
2457///
2458/// ## Examples:
2459///
2460/// ```r
2461/// oats$Nf <- ordered(oats$N, levels = sort(levels(oats$N)))
2462/// oats.aov <- aov(Y ~ Nf*V + Error(B/V), data = oats, qr = TRUE)
2463/// ## IGNORE_RDIFF_BEGIN
2464/// summary(oats.aov)
2465/// summary(oats.aov, split = list(Nf=list(L=1, Dev=2:3)))
2466/// ## IGNORE_RDIFF_END
2467/// par(mfrow = c(1,2), pty = "s")
2468/// plot(fitted(oats.aov[[4]]), studres(oats.aov[[4]]))
2469/// abline(h = 0, lty = 2)
2470/// oats.pr <- proj(oats.aov)
2471/// qqnorm(oats.pr[[4]][,"Residuals"], ylab = "Stratum 4 residuals")
2472/// qqline(oats.pr[[4]][,"Residuals"])
2473///
2474/// par(mfrow = c(1,1), pty = "m")
2475/// oats.aov2 <- aov(Y ~ N + V + Error(B/V), data = oats, qr = TRUE)
2476/// model.tables(oats.aov2, type = "means", se = TRUE)
2477/// ```
2478pub fn oats() -> PolarsResult<DataFrame> {
2479 CsvReader::new(Cursor::new(include_str!("oats.csv"))).finish()
2480}
2481
2482/// # Tests of Auditory Perception in Children with OME
2483///
2484/// ## Description:
2485///
2486/// Experiments were performed on children on their ability to
2487/// differentiate a signal in broad-band noise. The noise was played
2488/// from a pair of speakers and a signal was added to just one
2489/// channel; the subject had to turn his/her head to the channel with
2490/// the added signal. The signal was either coherent (the amplitude
2491/// of the noise was increased for a period) or incoherent
2492/// (independent noise was added for the same period to form the same
2493/// increase in power).
2494///
/// The threshold used in the original analysis was the stimulus
/// loudness needed to get 75% correct responses. Some of the children
2497/// had suffered from otitis media with effusion (OME).
2498///
2499/// ## Usage:
2500///
2501/// OME
2502///
2503/// ## Format:
2504///
2505/// The ‘OME’ data frame has 1129 rows and 7 columns:
2506///
2507/// * ‘ID’ Subject ID (1 to 99, with some IDs missing). A few subjects
2508/// were measured at different ages.
2509/// * ‘OME’ ‘"low"’ or ‘"high"’ or ‘"N/A"’ (at ages other than 30 and 60
2510/// months).
2511/// * ‘Age’ Age of the subject (months).
2512/// * ‘Loud’ Loudness of stimulus, in decibels.
2513/// * ‘Noise’ Whether the signal in the stimulus was ‘"coherent"’ or
2514/// ‘"incoherent"’.
2515/// * ‘Correct’ Number of correct responses from ‘Trials’ trials.
2516/// * ‘Trials’ Number of trials performed.
2517///
2518/// ## Background:
2519///
2520/// The experiment was to study otitis media with effusion (OME), a
2521/// very common childhood condition where the middle ear space, which
2522/// is normally air-filled, becomes congested by a fluid. There is a
2523/// concomitant fluctuating, conductive hearing loss which can result
2524/// in various language, cognitive and social deficits. The term
2525/// ‘binaural hearing’ is used to describe the listening conditions in
2526/// which the brain is processing information from both ears at the
2527/// same time. The brain computes differences in the intensity and/or
2528/// timing of signals arriving at each ear which contributes to sound
2529/// localisation and also to our ability to hear in background noise.
2530///
2531/// Some years ago, it was found that children of 7-8 years with a
2532/// history of significant OME had significantly worse binaural
2533/// hearing than children without such a history, despite having
2534/// equivalent sensitivity. The question remained as to whether it
2535/// was the timing, the duration, or the degree of severity of the
2536/// otitis media episodes during critical periods, which affected
2537/// later binaural hearing. In an attempt to begin to answer this
2538/// question, 95 children were monitored for the presence of effusion
2539/// every month since birth. On the basis of OME experience in their
2540/// first two years, the test population was split into one group of
2541/// high OME prevalence and one of low prevalence.
2542///
2543/// ## Source:
2544///
2545/// Sarah Hogan, Dept of Physiology, University of Oxford, via Dept of
2546/// Statistics Consulting Service
2547///
2548/// ## Examples:
2549///
2550/// ```r
2551/// # Fit logistic curve from p = 0.5 to p = 1.0
2552/// fp1 <- deriv(~ 0.5 + 0.5/(1 + exp(-(x-L75)/scal)),
2553/// c("L75", "scal"),
2554/// function(x,L75,scal)NULL)
2555/// nls(Correct/Trials ~ fp1(Loud, L75, scal), data = OME,
2556/// start = c(L75=45, scal=3))
2557/// nls(Correct/Trials ~ fp1(Loud, L75, scal),
2558/// data = OME[OME$Noise == "coherent",],
2559/// start=c(L75=45, scal=3))
2560/// nls(Correct/Trials ~ fp1(Loud, L75, scal),
2561/// data = OME[OME$Noise == "incoherent",],
2562/// start = c(L75=45, scal=3))
2563///
2564/// # individual fits for each experiment
2565///
2566/// aa <- factor(OME$Age)
2567/// ab <- 10*OME$ID + unclass(aa)
2568/// ac <- unclass(factor(ab))
2569/// OME$UID <- as.vector(ac)
2570/// OME$UIDn <- OME$UID + 0.1*(OME$Noise == "incoherent")
2571/// rm(aa, ab, ac)
2572/// OMEi <- OME
2573///
2574/// library(nlme)
2575/// fp2 <- deriv(~ 0.5 + 0.5/(1 + exp(-(x-L75)/2)),
2576/// "L75", function(x,L75) NULL)
2577/// dec <- getOption("OutDec")
2578/// options(show.error.messages = FALSE, OutDec=".")
2579/// OMEi.nls <- nlsList(Correct/Trials ~ fp2(Loud, L75) | UIDn,
2580/// data = OMEi, start = list(L75=45), control = list(maxiter=100))
2581/// options(show.error.messages = TRUE, OutDec=dec)
2582/// tmp <- sapply(OMEi.nls, function(X)
2583/// {if(is.null(X)) NA else as.vector(coef(X))})
2584/// OMEif <- data.frame(UID = round(as.numeric((names(tmp)))),
2585/// Noise = rep(c("coherent", "incoherent"), 110),
2586/// L75 = as.vector(tmp), stringsAsFactors = TRUE)
2587/// OMEif$Age <- OME$Age[match(OMEif$UID, OME$UID)]
2588/// OMEif$OME <- OME$OME[match(OMEif$UID, OME$UID)]
2589/// OMEif <- OMEif[OMEif$L75 > 30,]
2590/// summary(lm(L75 ~ Noise/Age, data = OMEif, na.action = na.omit))
2591/// summary(lm(L75 ~ Noise/(Age + OME), data = OMEif,
2592/// subset = (Age >= 30 & Age <= 60),
2593/// na.action = na.omit), correlation = FALSE)
2594///
2595/// # Or fit by weighted least squares
2596/// fpl75 <- deriv(~ sqrt(n)*(r/n - 0.5 - 0.5/(1 + exp(-(x-L75)/scal))),
2597/// c("L75", "scal"),
2598/// function(r,n,x,L75,scal) NULL)
2599/// nls(0 ~ fpl75(Correct, Trials, Loud, L75, scal),
2600/// data = OME[OME$Noise == "coherent",],
2601/// start = c(L75=45, scal=3))
2602/// nls(0 ~ fpl75(Correct, Trials, Loud, L75, scal),
2603/// data = OME[OME$Noise == "incoherent",],
2604/// start = c(L75=45, scal=3))
2605///
2606/// # Test to see if the curves shift with age
2607/// fpl75age <- deriv(~sqrt(n)*(r/n - 0.5 - 0.5/(1 +
2608/// exp(-(x-L75-slope*age)/scal))),
2609/// c("L75", "slope", "scal"),
2610/// function(r,n,x,age,L75,slope,scal) NULL)
2611/// OME.nls1 <-
2612/// nls(0 ~ fpl75age(Correct, Trials, Loud, Age, L75, slope, scal),
2613/// data = OME[OME$Noise == "coherent",],
2614/// start = c(L75=45, slope=0, scal=2))
2615/// sqrt(diag(vcov(OME.nls1)))
2616///
2617/// OME.nls2 <-
2618/// nls(0 ~ fpl75age(Correct, Trials, Loud, Age, L75, slope, scal),
2619/// data = OME[OME$Noise == "incoherent",],
2620/// start = c(L75=45, slope=0, scal=2))
2621/// sqrt(diag(vcov(OME.nls2)))
2622///
2623/// # Now allow random effects by using NLME
2624/// OMEf <- OME[rep(1:nrow(OME), OME$Trials),]
2625/// OMEf$Resp <- with(OME, rep(rep(c(1,0), length(Trials)),
2626/// t(cbind(Correct, Trials-Correct))))
2627/// OMEf <- OMEf[, -match(c("Correct", "Trials"), names(OMEf))]
2628///
2629/// ## Not run:
2630/// ## these fail in R on most platforms
2631/// fp2 <- deriv(~ 0.5 + 0.5/(1 + exp(-(x-L75)/exp(lsc))),
2632/// c("L75", "lsc"),
2633/// function(x, L75, lsc) NULL)
2634/// try(summary(nlme(Resp ~ fp2(Loud, L75, lsc),
2635/// fixed = list(L75 ~ Age, lsc ~ 1),
2636/// random = L75 + lsc ~ 1 | UID,
2637/// data = OMEf[OMEf$Noise == "coherent",], method = "ML",
2638/// start = list(fixed=c(L75=c(48.7, -0.03), lsc=0.24)), verbose = TRUE)))
2639///
2640/// try(summary(nlme(Resp ~ fp2(Loud, L75, lsc),
2641/// fixed = list(L75 ~ Age, lsc ~ 1),
2642/// random = L75 + lsc ~ 1 | UID,
2643/// data = OMEf[OMEf$Noise == "incoherent",], method = "ML",
2644/// start = list(fixed=c(L75=c(41.5, -0.1), lsc=0)), verbose = TRUE)))
2645/// ## End(Not run)
2646/// ```
2647pub fn ome() -> PolarsResult<DataFrame> {
2648 CsvReader::new(Cursor::new(include_str!("OME.csv"))).finish()
2649}
2650
2651/// # The Painter's Data of de Piles
2652///
2653/// ## Description:
2654///
2655/// The subjective assessment, on a 0 to 20 integer scale, of 54
2656/// classical painters. The painters were assessed on four
2657/// characteristics: composition, drawing, colour and expression. The
2658/// data is due to the Eighteenth century art critic, de Piles.
2659///
2660/// ## Usage:
2661///
2662/// painters
2663///
2664/// ## Format:
2665///
2666/// The row names of the data frame are the painters. The components
2667/// are:
2668///
2669/// * ‘Composition’ Composition score.
2670/// * ‘Drawing’ Drawing score.
2671/// * ‘Colour’ Colour score.
2672/// * ‘Expression’ Expression score.
2673/// * ‘School’ The school to which a painter belongs, as indicated by a
2674/// factor level code as follows: ‘"A"’: Renaissance; ‘"B"’:
2675/// Mannerist; ‘"C"’: Seicento; ‘"D"’: Venetian; ‘"E"’: Lombard;
2676/// ‘"F"’: Sixteenth Century; ‘"G"’: Seventeenth Century; ‘"H"’:
2677/// French.
2678///
2679/// ## Source:
2680///
2681/// A. J. Weekes (1986) _A Genstat Primer._ Edward Arnold.
2682///
2683/// M. Davenport and G. Studdert-Kennedy (1972) The statistical
2684/// analysis of aesthetic judgement: an exploration. _Applied
2685/// Statistics_ *21*, 324-333.
2686///
2687/// I. T. Jolliffe (1986) _Principal Component Analysis._ Springer.
2688///
2689/// ## References:
2690///
2691/// Venables, W. N. and Ripley, B. D. (2002) _Modern Applied
2692/// Statistics with S._ Fourth edition. Springer.
2693pub fn painters() -> PolarsResult<DataFrame> {
2694 CsvReader::new(Cursor::new(include_str!("painters.csv"))).finish()
2695}
2696
/// # N. H. Prater's Petrol Refinery Data
2698///
2699/// ## Description:
2700///
2701/// The yield of a petroleum refining process with four covariates.
2702/// The crude oil appears to come from only 10 distinct samples.
2703///
2704/// These data were originally used by Prater (1956) to build an
2705/// estimation equation for the yield of the refining process of crude
2706/// oil to gasoline.
2707///
2708/// ## Usage:
2709///
2710/// petrol
2711///
2712/// ## Format:
2713///
2714/// The variables are as follows
2715///
2716/// * ‘No’ crude oil sample identification label. (Factor.)
2717/// * ‘SG’ specific gravity, degrees API. (Constant within sample.)
2718/// * ‘VP’ vapour pressure in pounds per square inch. (Constant within
2719/// sample.)
2720/// * ‘V10’ volatility of crude; ASTM 10% point. (Constant within
2721/// sample.)
2722/// * ‘EP’ desired volatility of gasoline. (The end point. Varies
2723/// within sample.)
2724/// * ‘Y’ yield as a percentage of crude.
2725///
2726/// ## Source:
2727///
2728/// N. H. Prater (1956) Estimate gasoline yields from crudes.
2729/// _Petroleum Refiner_ *35*, 236-238.
2730///
2731/// This dataset is also given in D. J. Hand, F. Daly, K. McConway, D.
2732/// Lunn and E. Ostrowski (eds) (1994) _A Handbook of Small Data
2733/// Sets._ Chapman & Hall.
2734///
2735/// ## References:
2736///
2737/// Venables, W. N. and Ripley, B. D. (2002) _Modern Applied
2738/// Statistics with S._ Fourth edition. Springer.
2739///
2740/// ## Examples:
2741///
2742/// ```r
2743/// library(nlme)
2744/// Petrol <- petrol
2745/// Petrol[, 2:5] <- scale(as.matrix(Petrol[, 2:5]), scale = FALSE)
2746/// pet3.lme <- lme(Y ~ SG + VP + V10 + EP,
2747/// random = ~ 1 | No, data = Petrol)
2748/// pet3.lme <- update(pet3.lme, method = "ML")
2749/// pet4.lme <- update(pet3.lme, fixed. = Y ~ V10 + EP)
2750/// anova(pet4.lme, pet3.lme)
2751/// ```
2752pub fn petrol() -> PolarsResult<DataFrame> {
2753 CsvReader::new(Cursor::new(include_str!("petrol.csv"))).finish()
2754}
2755
2756/// # Belgium Phone Calls 1950-1973
2757///
2758/// ## Description:
2759///
2760/// A list object with the annual numbers of telephone calls, in
2761/// Belgium. The components are:
2762///
2763/// * ‘year’ last two digits of the year.
2764/// * ‘calls’ number of telephone calls made (in millions of calls).
2765///
2766/// ## Usage:
2767///
2768/// phones
2769///
2770/// ## Source:
2771///
2772/// P. J. Rousseeuw and A. M. Leroy (1987) _Robust Regression &
2773/// Outlier Detection._ Wiley.
2774///
2775/// ## References:
2776///
2777/// Venables, W. N. and Ripley, B. D. (2002) _Modern Applied
2778/// Statistics with S._ Fourth edition. Springer.
2779pub fn phones() -> PolarsResult<DataFrame> {
2780 CsvReader::new(Cursor::new(include_str!("phones.csv"))).finish()
2781}
2782
2783/// # Diabetes in Pima Indian Women
2784///
2785/// ## Description:
2786///
2787/// A population of women who were at least 21 years old, of Pima
2788/// Indian heritage and living near Phoenix, Arizona, was tested for
2789/// diabetes according to World Health Organization criteria. The
2790/// data were collected by the US National Institute of Diabetes and
2791/// Digestive and Kidney Diseases. We used the 532 complete records
2792/// after dropping the (mainly missing) data on serum insulin.
2793///
2794/// ## Usage:
2795///
2796/// Pima.tr
2797/// Pima.tr2
2798/// Pima.te
2799///
2800/// ## Format:
2801///
/// These data frames contain the following columns:
2803///
2804/// * ‘npreg’ number of pregnancies.
2805/// * ‘glu’ plasma glucose concentration in an oral glucose tolerance
2806/// test.
2807/// * ‘bp’ diastolic blood pressure (mm Hg).
2808/// * ‘skin’ triceps skin fold thickness (mm).
2809/// * ‘bmi’ body mass index (weight in kg/(height in m)\^2).
2810/// * ‘ped’ diabetes pedigree function.
2811/// * ‘age’ age in years.
2812/// * ‘type’ ‘Yes’ or ‘No’, for diabetic according to WHO criteria.
2813///
2814/// ## Details:
2815///
2816/// The training set ‘Pima.tr’ contains a randomly selected set of 200
2817/// subjects, and ‘Pima.te’ contains the remaining 332 subjects.
2818/// ‘Pima.tr2’ contains ‘Pima.tr’ plus 100 subjects with missing
2819/// values in the explanatory variables.
2820///
2821/// ## Source:
2822///
2823/// Smith, J. W., Everhart, J. E., Dickson, W. C., Knowler, W. C. and
2824/// Johannes, R. S. (1988) Using the ADAP learning algorithm to
2825/// forecast the onset of _diabetes mellitus_. In _Proceedings of the
2826/// Symposium on Computer Applications in Medical Care (Washington,
2827/// 1988),_ ed. R. A. Greenes, pp. 261-265. Los Alamitos, CA: IEEE
2828/// Computer Society Press.
2829///
2830/// Ripley, B.D. (1996) _Pattern Recognition and Neural Networks._
2831/// Cambridge: Cambridge University Press.
2832pub fn pima_te() -> PolarsResult<DataFrame> {
2833 CsvReader::new(Cursor::new(include_str!("Pima.te.csv"))).finish()
2834}
2835
2836/// # Diabetes in Pima Indian Women
2837///
2838/// ## Description:
2839///
2840/// A population of women who were at least 21 years old, of Pima
2841/// Indian heritage and living near Phoenix, Arizona, was tested for
2842/// diabetes according to World Health Organization criteria. The
2843/// data were collected by the US National Institute of Diabetes and
2844/// Digestive and Kidney Diseases. We used the 532 complete records
2845/// after dropping the (mainly missing) data on serum insulin.
2846///
2847/// ## Usage:
2848///
2849/// Pima.tr
2850/// Pima.tr2
2851/// Pima.te
2852///
2853/// ## Format:
2854///
/// These data frames contain the following columns:
2856///
2857/// * ‘npreg’ number of pregnancies.
2858/// * ‘glu’ plasma glucose concentration in an oral glucose tolerance
2859/// test.
2860/// * ‘bp’ diastolic blood pressure (mm Hg).
2861/// * ‘skin’ triceps skin fold thickness (mm).
2862/// * ‘bmi’ body mass index (weight in kg/(height in m)\^2).
2863/// * ‘ped’ diabetes pedigree function.
2864/// * ‘age’ age in years.
2865/// * ‘type’ ‘Yes’ or ‘No’, for diabetic according to WHO criteria.
2866///
2867/// ## Details:
2868///
2869/// The training set ‘Pima.tr’ contains a randomly selected set of 200
2870/// subjects, and ‘Pima.te’ contains the remaining 332 subjects.
2871/// ‘Pima.tr2’ contains ‘Pima.tr’ plus 100 subjects with missing
2872/// values in the explanatory variables.
2873///
2874/// ## Source:
2875///
2876/// Smith, J. W., Everhart, J. E., Dickson, W. C., Knowler, W. C. and
2877/// Johannes, R. S. (1988) Using the ADAP learning algorithm to
2878/// forecast the onset of _diabetes mellitus_. In _Proceedings of the
2879/// Symposium on Computer Applications in Medical Care (Washington,
2880/// 1988),_ ed. R. A. Greenes, pp. 261-265. Los Alamitos, CA: IEEE
2881/// Computer Society Press.
2882///
2883/// Ripley, B.D. (1996) _Pattern Recognition and Neural Networks._
2884/// Cambridge: Cambridge University Press.
2885pub fn pima_tr() -> PolarsResult<DataFrame> {
2886 CsvReader::new(Cursor::new(include_str!("Pima.tr.csv"))).finish()
2887}
2888
2889/// # Diabetes in Pima Indian Women
2890///
2891/// ## Description:
2892///
2893/// A population of women who were at least 21 years old, of Pima
2894/// Indian heritage and living near Phoenix, Arizona, was tested for
2895/// diabetes according to World Health Organization criteria. The
2896/// data were collected by the US National Institute of Diabetes and
2897/// Digestive and Kidney Diseases. We used the 532 complete records
2898/// after dropping the (mainly missing) data on serum insulin.
2899///
2900/// ## Usage:
2901///
2902/// Pima.tr
2903/// Pima.tr2
2904/// Pima.te
2905///
2906/// ## Format:
2907///
/// These data frames contain the following columns:
2909///
2910/// * ‘npreg’ number of pregnancies.
2911/// * ‘glu’ plasma glucose concentration in an oral glucose tolerance
2912/// test.
2913/// * ‘bp’ diastolic blood pressure (mm Hg).
2914/// * ‘skin’ triceps skin fold thickness (mm).
2915/// * ‘bmi’ body mass index (weight in kg/(height in m)\^2).
2916/// * ‘ped’ diabetes pedigree function.
2917/// * ‘age’ age in years.
2918/// * ‘type’ ‘Yes’ or ‘No’, for diabetic according to WHO criteria.
2919///
2920/// ## Details:
2921///
2922/// The training set ‘Pima.tr’ contains a randomly selected set of 200
2923/// subjects, and ‘Pima.te’ contains the remaining 332 subjects.
2924/// ‘Pima.tr2’ contains ‘Pima.tr’ plus 100 subjects with missing
2925/// values in the explanatory variables.
2926///
2927/// ## Source:
2928///
2929/// Smith, J. W., Everhart, J. E., Dickson, W. C., Knowler, W. C. and
2930/// Johannes, R. S. (1988) Using the ADAP learning algorithm to
2931/// forecast the onset of _diabetes mellitus_. In _Proceedings of the
2932/// Symposium on Computer Applications in Medical Care (Washington,
2933/// 1988),_ ed. R. A. Greenes, pp. 261-265. Los Alamitos, CA: IEEE
2934/// Computer Society Press.
2935///
2936/// Ripley, B.D. (1996) _Pattern Recognition and Neural Networks._
2937/// Cambridge: Cambridge University Press.
2938pub fn pima_tr2() -> PolarsResult<DataFrame> {
2939 CsvReader::new(Cursor::new(include_str!("Pima.tr2.csv"))).finish()
2940}
2941
2942/// # Absenteeism from School in Rural New South Wales
2943///
2944/// ## Description:
2945///
2946/// The ‘quine’ data frame has 146 rows and 5 columns. Children from
2947/// Walgett, New South Wales, Australia, were classified by Culture,
2948/// Age, Sex and Learner status and the number of days absent from
2949/// school in a particular school year was recorded.
2950///
2951/// ## Usage:
2952///
2953/// quine
2954///
2955/// ## Format:
2956///
2957/// This data frame contains the following columns:
2958///
2959/// * ‘Eth’ ethnic background: Aboriginal or Not, (‘"A"’ or ‘"N"’).
2960/// * ‘Sex’ sex: factor with levels (‘"F"’ or ‘"M"’).
/// * ‘Age’ age group: Primary (‘"F0"’), or forms ‘"F1"’, ‘"F2"’ or
2962/// ‘"F3"’.
2963/// * ‘Lrn’ learner status: factor with levels Average or Slow learner,
2964/// (‘"AL"’ or ‘"SL"’).
2965/// * ‘Days’ days absent from school in the year.
2966///
2967/// ## Source:
2968///
2969/// S. Quine, quoted in Aitkin, M. (1978) The analysis of unbalanced
2970/// cross classifications (with discussion). _Journal of the Royal
2971/// Statistical Society series A_ *141*, 195-223.
2972///
2973/// ## References:
2974///
2975/// Venables, W. N. and Ripley, B. D. (2002) _Modern Applied
2976/// Statistics with S._ Fourth edition. Springer.
2977pub fn quine() -> PolarsResult<DataFrame> {
2978 CsvReader::new(Cursor::new(include_str!("quine.csv"))).finish()
2979}
2980
2981/// # Blood Pressure in Rabbits
2982///
2983/// ## Description:
2984///
2985/// Five rabbits were studied on two occasions, after treatment with
2986/// saline (control) and after treatment with the 5-HT_3 antagonist
2987/// MDL 72222. After each treatment ascending doses of
2988/// phenylbiguanide were injected intravenously at 10 minute intervals
2989/// and the responses of mean blood pressure measured. The goal was
2990/// to test whether the cardiogenic chemoreflex elicited by
2991/// phenylbiguanide depends on the activation of 5-HT_3 receptors.
2992///
2993/// ## Usage:
2994///
2995/// Rabbit
2996///
2997/// ## Format:
2998///
2999/// This data frame contains 60 rows and the following variables:
3000///
3001/// * ‘BPchange’ change in blood pressure relative to the start of the
3002/// experiment.
3003/// * ‘Dose’ dose of Phenylbiguanide in micrograms.
3004/// * ‘Run’ label of run (‘"C1"’ to ‘"C5"’, then ‘"M1"’ to ‘"M5"’).
3005/// * ‘Treatment’ placebo or the 5-HT_3 antagonist MDL 72222.
3006/// * ‘Animal’ label of animal used (‘"R1"’ to ‘"R5"’).
3007///
3008/// ## Source:
3009///
3010/// J. Ludbrook (1994) Repeated measurements and multiple comparisons
3011/// in cardiovascular research. _Cardiovascular Research_ *28*,
3012/// 303-311.
3013/// [The numerical data are not in the paper but were supplied by
3014/// Professor Ludbrook]
3015///
3016/// ## References:
3017///
3018/// Venables, W. N. and Ripley, B. D. (2002) _Modern Applied
3019/// Statistics with S._ Fourth edition. Springer.
3020pub fn rabbit() -> PolarsResult<DataFrame> {
3021 CsvReader::new(Cursor::new(include_str!("Rabbit.csv"))).finish()
3022}
3023
3024/// # Road Accident Deaths in US States
3025///
3026/// ## Description:
3027///
3028/// A data frame with the annual deaths in road accidents for half the
3029/// US states.
3030///
3031/// ## Usage:
3032///
3033/// road
3034///
3035/// ## Format:
3036///
3037/// Columns are:
3038///
3039/// * ‘state’ name.
3040/// * ‘deaths’ number of deaths.
3041/// * ‘drivers’ number of drivers (in 10,000s).
3042/// * ‘popden’ population density in people per square mile.
3043/// * ‘rural’ length of rural roads, in 1000s of miles.
3044/// * ‘temp’ average daily maximum temperature in January.
3045/// * ‘fuel’ fuel consumption in 10,000,000 US gallons per year.
3046///
3047/// ## Source:
3048///
3049/// Imperial College, London M.Sc. exercise
3050pub fn road() -> PolarsResult<DataFrame> {
3051 CsvReader::new(Cursor::new(include_str!("road.csv"))).finish()
3052}
3053
3054/// # Numbers of Rotifers by Fluid Density
3055///
3056/// ## Description:
3057///
3058/// The data give the numbers of rotifers falling out of suspension
3059/// for different fluid densities. There are two species, ‘pm’
/// _Polyarthra major_ and ‘kc’, _Keratella cochlearis_ and for each
3061/// species the number falling out and the total number are given.
3062///
3063/// ## Usage:
3064///
3065/// rotifer
3066///
3067/// ## Format:
3068///
3069/// * ‘density’ specific density of fluid.
3070/// * ‘pm.y’ number falling out for _P. major_.
3071/// * ‘pm.total’ total number of _P. major_.
3072/// * ‘kc.y’ number falling out for _K. cochlearis_.
3073/// * ‘kc.tot’ total number of _K. cochlearis_.
3074///
3075/// ## Source:
3076///
3077/// D. Collett (1991) _Modelling Binary Data._ Chapman & Hall. p. 217
3078pub fn rotifer() -> PolarsResult<DataFrame> {
3079 CsvReader::new(Cursor::new(include_str!("rotifer.csv"))).finish()
3080}
3081
3082/// # Accelerated Testing of Tyre Rubber
3083///
3084/// ## Description:
3085///
3086/// Data frame from accelerated testing of tyre rubber.
3087///
3088/// ## Usage:
3089///
3090/// Rubber
3091///
3092/// ## Format:
3093///
3094/// * ‘loss’ the abrasion loss in gm/hr.
3095/// * ‘hard’ the hardness in Shore units.
3096/// * ‘tens’ tensile strength in kg/sq m.
3097///
3098/// ## Source:
3099///
3100/// O.L. Davies (1947) _Statistical Methods in Research and
3101/// Production._ Oliver and Boyd, Table 6.1 p. 119.
3102///
3103/// O.L. Davies and P.L. Goldsmith (1972) _Statistical Methods in
3104/// Research and Production._ 4th edition, Longmans, Table 8.1 p. 239.
3105///
3106/// ## References:
3107///
3108/// Venables, W. N. and Ripley, B. D. (2002) _Modern Applied
3109/// Statistics with S-PLUS._ Fourth Edition. Springer.
3110pub fn rubber() -> PolarsResult<DataFrame> {
3111 CsvReader::new(Cursor::new(include_str!("Rubber.csv"))).finish()
3112}
3113
3114/// # Ships Damage Data
3115///
3116/// ## Description:
3117///
3118/// Data frame giving the number of damage incidents and aggregate
3119/// months of service by ship type, year of construction, and period
3120/// of operation.
3121///
3122/// ## Usage:
3123///
3124/// ships
3125///
3126/// ## Format:
3127///
3128/// * ‘type’ type: ‘"A"’ to ‘"E"’.
3129/// * ‘year’ year of construction: 1960-64, 65-69, 70-74, 75-79 (coded
3130/// as ‘"60"’, ‘"65"’, ‘"70"’, ‘"75"’).
3131/// * ‘period’ period of operation : 1960-74, 75-79.
3132/// * ‘service’ aggregate months of service.
3133/// * ‘incidents’ number of damage incidents.
3134///
3135/// ## Source:
3136///
3137/// P. McCullagh and J. A. Nelder, (1983), _Generalized Linear
3138/// Models._ Chapman & Hall, section 6.3.2, page 137
3139pub fn ships() -> PolarsResult<DataFrame> {
3140 CsvReader::new(Cursor::new(include_str!("ships.csv"))).finish()
3141}
3142
3143/// # Shoe wear data of Box, Hunter and Hunter
3144///
3145/// ## Description:
3146///
3147/// A list of two vectors, giving the wear of shoes of materials A and
3148/// B for one foot each of ten boys.
3149///
3150/// ## Usage:
3151///
3152/// shoes
3153///
3154/// ## Source:
3155///
3156/// G. E. P. Box, W. G. Hunter and J. S. Hunter (1978) _Statistics for
3157/// Experimenters._ Wiley, p. 100
3158///
3159/// ## References:
3160///
3161/// Venables, W. N. and Ripley, B. D. (2002) _Modern Applied
3162/// Statistics with S._ Fourth edition. Springer.
3163pub fn shoes() -> PolarsResult<DataFrame> {
3164 CsvReader::new(Cursor::new(include_str!("shoes.csv"))).finish()
3165}
3166
3167/// # Percentage of Shrimp in Shrimp Cocktail
3168///
3169/// ## Description:
3170///
3171/// A numeric vector with 18 determinations by different laboratories
3172/// of the amount (percentage of the declared total weight) of shrimp
3173/// in shrimp cocktail.
3174///
3175/// ## Usage:
3176///
3177/// shrimp
3178///
3179/// ## Source:
3180///
3181/// F. J. King and J. J. Ryan (1976) Collaborative study of the
3182/// determination of the amount of shrimp in shrimp cocktail. _J. Off.
3183/// Anal. Chem._ *59*, 644-649.
3184///
3185/// R. G. Staudte and S. J. Sheather (1990) _Robust Estimation and
3186/// Testing._ Wiley.
3187pub fn shrimp() -> PolarsResult<DataFrame> {
3188 CsvReader::new(Cursor::new(include_str!("shrimp.csv"))).finish()
3189}
3190
3191/// # Space Shuttle Autolander Problem
3192///
3193/// ## Description:
3194///
3195/// The ‘shuttle’ data frame has 256 rows and 7 columns. The first
3196/// six columns are categorical variables giving example conditions;
3197/// the seventh is the decision. The first 253 rows are the training
3198/// set, the last 3 the test conditions.
3199///
3200/// ## Usage:
3201///
3202/// shuttle
3203///
3204/// ## Format:
3205///
3206/// This data frame contains the following factor columns:
3207///
3208/// * ‘stability’ stable positioning or not (‘stab’ / ‘xstab’).
3209/// * ‘error’ size of error (‘MM’ / ‘SS’ / ‘LX’ / ‘XL’).
3210/// * ‘sign’ sign of error, positive or negative (‘pp’ / ‘nn’).
3211/// * ‘wind’ wind sign (‘head’ / ‘tail’).
3212/// * ‘magn’ wind strength (‘Light’ / ‘Medium’ / ‘Strong’ / ‘Out of
3213/// Range’).
3214/// * ‘vis’ visibility (‘yes’ / ‘no’).
3215/// * ‘use’ use the autolander or not. (‘auto’ / ‘noauto’.)
3216///
3217/// ## Source:
3218///
3219/// D. Michie (1989) Problems of computer-aided concept formation. In
3220/// _Applications of Expert Systems 2_, ed. J. R. Quinlan, Turing
3221/// Institute Press / Addison-Wesley, pp. 310-333.
3222///
3223/// ## References:
3224///
3225/// Venables, W. N. and Ripley, B. D. (2002) _Modern Applied
3226/// Statistics with S._ Fourth edition. Springer.
3227pub fn shuttle() -> PolarsResult<DataFrame> {
3228 CsvReader::new(Cursor::new(include_str!("shuttle.csv"))).finish()
3229}
3230
3231/// # Growth Curves for Sitka Spruce Trees in 1988
3232///
3233/// ## Description:
3234///
3235/// The ‘Sitka’ data frame has 395 rows and 4 columns. It gives
3236/// repeated measurements on the log-size of 79 Sitka spruce trees, 54
3237/// of which were grown in ozone-enriched chambers and 25 were
3238/// controls. The size was measured five times in 1988, at roughly
3239/// monthly intervals.
3240///
3241/// ## Usage:
3242///
3243/// Sitka
3244///
3245/// ## Format:
3246///
3247/// This data frame contains the following columns:
3248///
3249/// * ‘size’ measured size (height times diameter squared) of tree, on
3250/// log scale.
3251/// * ‘Time’ time of measurement in days since 1 January 1988.
3252/// * ‘tree’ number of tree.
3253/// * ‘treat’ either ‘"ozone"’ for an ozone-enriched chamber or
3254/// ‘"control"’.
3255///
3256/// ## Source:
3257///
3258/// P. J. Diggle, K.-Y. Liang and S. L. Zeger (1994) _Analysis of
3259/// Longitudinal Data._ Clarendon Press, Oxford
3260///
3261/// ## References:
3262///
3263/// Venables, W. N. and Ripley, B. D. (2002) _Modern Applied
3264/// Statistics with S._ Fourth edition. Springer.
3265///
3266/// ## See Also:
3267///
3268/// ‘Sitka89’.
3269pub fn sitka() -> PolarsResult<DataFrame> {
3270 CsvReader::new(Cursor::new(include_str!("Sitka.csv"))).finish()
3271}
3272
3273/// # Growth Curves for Sitka Spruce Trees in 1989
3274///
3275/// ## Description:
3276///
3277/// The ‘Sitka89’ data frame has 632 rows and 4 columns. It gives
3278/// repeated measurements on the log-size of 79 Sitka spruce trees, 54
3279/// of which were grown in ozone-enriched chambers and 25 were
3280/// controls. The size was measured eight times in 1989, at roughly
3281/// monthly intervals.
3282///
3283/// ## Usage:
3284///
3285/// Sitka89
3286///
3287/// ## Format:
3288///
3289/// This data frame contains the following columns:
3290///
3291/// * ‘size’ measured size (height times diameter squared) of tree, on
3292/// log scale.
3293/// * ‘Time’ time of measurement in days since 1 January 1988.
3294/// * ‘tree’ number of tree.
3295/// * ‘treat’ either ‘"ozone"’ for an ozone-enriched chamber or
3296/// ‘"control"’.
3297///
3298/// ## Source:
3299///
3300/// P. J. Diggle, K.-Y. Liang and S. L. Zeger (1994) _Analysis of
3301/// Longitudinal Data._ Clarendon Press, Oxford
3302///
3303/// ## See Also:
3304///
3305/// ‘Sitka’
3306pub fn sitka89() -> PolarsResult<DataFrame> {
3307 CsvReader::new(Cursor::new(include_str!("Sitka89.csv"))).finish()
3308}
3309
3310/// # AFM Compositions of Aphyric Skye Lavas
3311///
3312/// ## Description:
3313///
3314/// The ‘Skye’ data frame has 23 rows and 3 columns.
3315///
3316/// ## Usage:
3317///
3318/// Skye
3319///
3320/// ## Format:
3321///
3322/// This data frame contains the following columns:
3323///
3324/// * ‘A’ Percentage of sodium and potassium oxides.
3325/// * ‘F’ Percentage of iron oxide.
3326/// * ‘M’ Percentage of magnesium oxide.
3327///
3328/// ## Source:
3329///
3330/// R. N. Thompson, J. Esson and A. C. Duncan (1972) Major element
3331/// chemical variation in the Eocene lavas of the Isle of Skye. _J.
3332/// Petrology_, *13*, 219-253.
3333///
3334/// ## References:
3335///
3336/// J. Aitchison (1986) _The Statistical Analysis of Compositional
3337/// Data._ Chapman and Hall, p.360.
3338///
3339/// Venables, W. N. and Ripley, B. D. (2002) _Modern Applied
3340/// Statistics with S._ Fourth edition. Springer.
3341///
3342/// ## Examples:
3343///
3344/// ```r
3345/// # ternary() is from the on-line answers.
3346/// ternary <- function(X, pch = par("pch"), lcex = 1,
3347/// add = FALSE, ord = 1:3, ...)
3348/// {
3349/// X <- as.matrix(X)
3350/// if(any(X < 0)) stop("X must be non-negative")
3351/// s <- drop(X %*% rep(1, ncol(X)))
3352/// if(any(s<=0)) stop("each row of X must have a positive sum")
3353/// if(max(abs(s-1)) > 1e-6) {
3354/// warning("row(s) of X will be rescaled")
3355/// X <- X / s
3356/// }
3357/// X <- X[, ord]
3358/// s3 <- sqrt(1/3)
3359/// if(!add)
3360/// {
3361/// oldpty <- par("pty")
3362/// on.exit(par(pty=oldpty))
3363/// par(pty="s")
3364/// plot(c(-s3, s3), c(0.5-s3, 0.5+s3), type="n", axes=FALSE,
3365/// xlab="", ylab="")
3366/// polygon(c(0, -s3, s3), c(1, 0, 0), density=0)
3367/// lab <- NULL
3368/// if(!is.null(dn <- dimnames(X))) lab <- dn[[2]]
3369/// if(length(lab) < 3) lab <- as.character(1:3)
3370/// eps <- 0.05 * lcex
3371/// text(c(0, s3+eps*0.7, -s3-eps*0.7),
3372/// c(1+eps, -0.1*eps, -0.1*eps), lab, cex=lcex)
3373/// }
3374/// points((X[,2] - X[,3])*s3, X[,1], ...)
3375/// }
3376///
3377/// ternary(Skye/100, ord=c(1,3,2))
3378/// ```
3379pub fn skye() -> PolarsResult<DataFrame> {
3380 CsvReader::new(Cursor::new(include_str!("Skye.csv"))).finish()
3381}
3382
3383/// # Snail Mortality Data
3384///
3385/// ## Description:
3386///
3387/// Groups of 20 snails were held for periods of 1, 2, 3 or 4 weeks in
3388/// carefully controlled conditions of temperature and relative
3389/// humidity. There were two species of snail, A and B, and the
3390/// experiment was designed as a 4 by 3 by 4 by 2 completely
3391/// randomized design. At the end of the exposure time the snails
3392/// were tested to see if they had survived; the process itself is
3393/// fatal for the animals. The object of the exercise was to model
3394/// the probability of survival in terms of the stimulus variables,
3395/// and in particular to test for differences between species.
3396///
3397/// The data are unusual in that in most cases fatalities during the
3398/// experiment were fairly small.
3399///
3400/// ## Usage:
3401///
3402/// snails
3403///
3404/// ## Format:
3405///
3406/// The data frame contains the following components:
3407///
3408/// * ‘Species’ snail species A (‘1’) or B (‘2’).
3409/// * ‘Exposure’ exposure in weeks.
3410/// * ‘Rel.Hum’ relative humidity (4 levels).
3411/// * ‘Temp’ temperature, in degrees Celsius (3 levels).
3412/// * ‘Deaths’ number of deaths.
3413/// * ‘N’ number of snails exposed.
3414///
3415/// ## Source:
3416///
3417/// Zoology Department, The University of Adelaide.
3418///
3419/// ## References:
3420///
3421/// Venables, W. N. and Ripley, B. D. (2002) _Modern Applied
3422/// Statistics with S-PLUS._ Fourth Edition. Springer.
3423pub fn snails() -> PolarsResult<DataFrame> {
3424 CsvReader::new(Cursor::new(include_str!("snails.csv"))).finish()
3425}
3426
3427/// # Returns of the Standard and Poors 500
3428///
3429/// ## Description:
3430///
3431/// Returns of the Standard and Poors 500 Index in the 1990's
3432///
3433/// ## Usage:
3434///
3435/// SP500
3436///
3437/// ## Format:
3438///
3439/// A vector of returns of the Standard and Poors 500 index for all
3440/// the trading days in 1990, 1991, ..., 1999.
3441///
3442/// ## References:
3443///
3444/// Venables, W. N. and Ripley, B. D. (2002) _Modern Applied
3445/// Statistics with S._ Fourth edition. Springer.
3446pub fn sp500() -> PolarsResult<DataFrame> {
3447 CsvReader::new(Cursor::new(include_str!("SP500.csv"))).finish()
3448}
3449
3450/// # The Saturated Steam Pressure Data
3451///
3452/// ## Description:
3453///
3454/// Temperature and pressure in a saturated steam driven experimental
3455/// device.
3456///
3457/// ## Usage:
3458///
3459/// steam
3460///
3461/// ## Format:
3462///
3463/// The data frame contains the following components:
3464///
3465/// * ‘Temp’ temperature, in degrees Celsius.
3466/// * ‘Press’ pressure, in Pascals.
3467///
3468/// ## Source:
3469///
3470/// N.R. Draper and H. Smith (1981) _Applied Regression Analysis._
3471/// Wiley, pp. 518-9.
3472///
3473/// ## References:
3474///
3475/// Venables, W. N. and Ripley, B. D. (2002) _Modern Applied
3476/// Statistics with S-PLUS._ Fourth Edition. Springer.
3477pub fn steam() -> PolarsResult<DataFrame> {
3478 CsvReader::new(Cursor::new(include_str!("steam.csv"))).finish()
3479}
3480
3481const STORMER: &'static str = include_str!("stormer.csv");
3482
3483/// # The Stormer Viscometer Data
3484///
3485/// ## Description:
3486///
3487/// The stormer viscometer measures the viscosity of a fluid by
3488/// measuring the time taken for an inner cylinder in the mechanism to
3489/// perform a fixed number of revolutions in response to an actuating
3490/// weight. The viscometer is calibrated by measuring the time taken
3491/// with varying weights while the mechanism is suspended in fluids of
3492/// accurately known viscosity. The data comes from such a
3493/// calibration, and theoretical considerations suggest a nonlinear
3494/// relationship between time, weight and viscosity, of the form ‘Time
3495/// = (B1*Viscosity)/(Weight - B2) + E’ where ‘B1’ and ‘B2’ are
3496/// unknown parameters to be estimated, and ‘E’ is error.
3497///
3498/// ## Usage:
3499///
3500/// stormer
3501///
3502/// ## Format:
3503///
3504/// The data frame contains the following components:
3505///
3506/// * ‘Viscosity’ viscosity of fluid.
3507/// * ‘Wt’ actuating weight.
3508/// * ‘Time’ time taken.
3509///
3510/// ## Source:
3511///
3512/// E. J. Williams (1959) _Regression Analysis._ Wiley.
3513///
3514/// ## References:
3515///
3516/// Venables, W. N. and Ripley, B. D. (2002) _Modern Applied
3517/// Statistics with S._ Fourth edition. Springer.
3518pub fn stormer() -> PolarsResult<DataFrame> {
3519 CsvReader::new(Cursor::new(STORMER)).finish()
3520}
3521
3522/// # Student Survey Data
3523///
3524/// ## Description:
3525///
3526/// This data frame contains the responses of 237 Statistics I
3527/// students at the University of Adelaide to a number of questions.
3528///
3529/// ## Usage:
3530///
3531/// survey
3532///
3533/// ## Format:
3534///
3535/// The components of the data frame are:
3536///
3537/// * ‘Sex’ The sex of the student. (Factor with levels ‘"Male"’ and
3538/// ‘"Female"’.)
3539/// * ‘Wr.Hnd’ span (distance from tip of thumb to tip of little finger
3540/// of spread hand) of writing hand, in centimetres.
3541/// * ‘NW.Hnd’ span of non-writing hand.
3542/// * ‘W.Hnd’ writing hand of student. (Factor, with levels ‘"Left"’ and
3543/// ‘"Right"’.)
3544/// * ‘Fold’ “Fold your arms! Which is on top” (Factor, with levels ‘"R
3545/// on L"’, ‘"L on R"’, ‘"Neither"’.)
3546/// * ‘Pulse’ pulse rate of student (beats per minute).
3547/// * ‘Clap’ ‘Clap your hands! Which hand is on top?’ (Factor, with
3548/// levels ‘"Right"’, ‘"Left"’, ‘"Neither"’.)
3549/// * ‘Exer’ how often the student exercises. (Factor, with levels
3550/// ‘"Freq"’ (frequently), ‘"Some"’, ‘"None"’.)
3551/// * ‘Smoke’ how much the student smokes. (Factor, levels ‘"Heavy"’,
3552/// ‘"Regul"’ (regularly), ‘"Occas"’ (occasionally), ‘"Never"’.)
3553/// * ‘Height’ height of the student in centimetres.
3554/// * ‘M.I’ whether the student expressed height in imperial
3555/// (feet/inches) or metric (centimetres/metres) units. (Factor,
3556/// levels ‘"Metric"’, ‘"Imperial"’.)
3557/// * ‘Age’ age of the student in years.
3558///
3559/// ## References:
3560///
3561/// Venables, W. N. and Ripley, B. D. (2002) _Modern Applied
3562/// Statistics with S-PLUS._ Fourth Edition. Springer.
3563pub fn survey() -> PolarsResult<DataFrame> {
3564 CsvReader::new(Cursor::new(include_str!("survey.csv"))).finish()
3565}
3566
3567/// # Synthetic Classification Problem
3568///
3569/// ## Description:
3570///
3571/// The ‘synth.tr’ data frame has 250 rows and 3 columns. The
3572/// ‘synth.te’ data frame has 100 rows and 3 columns. It is intended
/// that ‘synth.tr’ be used for training and ‘synth.te’ for testing.
3574///
3575/// ## Usage:
3576///
3577/// synth.tr
3578/// synth.te
3579///
3580/// ## Format:
3581///
/// These data frames contain the following columns:
3583///
3584/// * ‘xs’ x-coordinate
3585/// * ‘ys’ y-coordinate
3586/// * ‘yc’ class, coded as 0 or 1.
3587///
3588/// ## Source:
3589///
3590/// Ripley, B.D. (1994) Neural networks and related methods for
3591/// classification (with discussion). _Journal of the Royal
3592/// Statistical Society series B_ *56*, 409-456.
3593///
3594/// Ripley, B.D. (1996) _Pattern Recognition and Neural Networks._
3595/// Cambridge: Cambridge University Press.
3596pub fn synth_te() -> PolarsResult<DataFrame> {
3597 CsvReader::new(Cursor::new(include_str!("synth.te.csv"))).finish()
3598}
3599
3600/// # Synthetic Classification Problem
3601///
3602/// ## Description:
3603///
3604/// The ‘synth.tr’ data frame has 250 rows and 3 columns. The
3605/// ‘synth.te’ data frame has 100 rows and 3 columns. It is intended
/// that ‘synth.tr’ be used for training and ‘synth.te’ for testing.
3607///
3608/// ## Usage:
3609///
3610/// synth.tr
3611/// synth.te
3612///
3613/// ## Format:
3614///
/// These data frames contain the following columns:
3616///
3617/// * ‘xs’ x-coordinate
3618/// * ‘ys’ y-coordinate
3619/// * ‘yc’ class, coded as 0 or 1.
3620///
3621/// ## Source:
3622///
3623/// Ripley, B.D. (1994) Neural networks and related methods for
3624/// classification (with discussion). _Journal of the Royal
3625/// Statistical Society series B_ *56*, 409-456.
3626///
3627/// Ripley, B.D. (1996) _Pattern Recognition and Neural Networks._
3628/// Cambridge: Cambridge University Press.
3629pub fn synth_tr() -> PolarsResult<DataFrame> {
3630 CsvReader::new(Cursor::new(include_str!("synth.tr.csv"))).finish()
3631}
3632
3633/// # Spatial Topographic Data
3634///
3635/// ## Description:
3636///
3637/// The ‘topo’ data frame has 52 rows and 3 columns, of topographic
3638/// heights within a 310 feet square.
3639///
3640/// ## Usage:
3641///
3642/// topo
3643///
3644/// ## Format:
3645///
3646/// This data frame contains the following columns:
3647///
3648/// * ‘x’ x coordinates (units of 50 feet)
3649/// * ‘y’ y coordinates (units of 50 feet)
3650/// * ‘z’ heights (feet)
3651///
3652/// ## Source:
3653///
3654/// Davis, J.C. (1973) _Statistics and Data Analysis in Geology._
3655/// Wiley.
3656///
3657/// ## References:
3658///
3659/// Venables, W. N. and Ripley, B. D. (2002) _Modern Applied
3660/// Statistics with S._ Fourth edition. Springer.
3661pub fn topo() -> PolarsResult<DataFrame> {
3662 CsvReader::new(Cursor::new(include_str!("topo.csv"))).finish()
3663}
3664
3665/// # Effect of Swedish Speed Limits on Accidents
3666///
3667/// ## Description:
3668///
3669/// An experiment was performed in Sweden in 1961-2 to assess the
3670/// effect of a speed limit on the motorway accident rate. The
3671/// experiment was conducted on 92 days in each year, matched so that
3672/// day ‘j’ in 1962 was comparable to day ‘j’ in 1961. On some days
3673/// the speed limit was in effect and enforced, while on other days
3674/// there was no speed limit and cars tended to be driven faster. The
3675/// speed limit days tended to be in contiguous blocks.
3676///
3677/// ## Usage:
3678///
3679/// Traffic
3680///
3681/// ## Format:
3682///
3683/// This data frame contains the following columns:
3684///
3685/// * ‘year’ 1961 or 1962.
3686/// * ‘day’ of year.
3687/// * ‘limit’ was there a speed limit?
3688/// * ‘y’ traffic accident count for that day.
3689///
3690/// ## Source:
3691///
3692/// Svensson, A. (1981) On the goodness-of-fit test for the
3693/// multiplicative Poisson model. _Annals of Statistics,_ *9*,
3694/// 697-704.
3695///
3696/// ## References:
3697///
3698/// Venables, W. N. and Ripley, B. D. (2002) _Modern Applied
3699/// Statistics with S-PLUS._ Fourth Edition. Springer.
3700pub fn traffic() -> PolarsResult<DataFrame> {
3701 CsvReader::new(Cursor::new(include_str!("Traffic.csv"))).finish()
3702}
3703
3704/// # Nutritional and Marketing Information on US Cereals
3705///
3706/// ## Description:
3707///
3708/// The ‘UScereal’ data frame has 65 rows and 11 columns. The data
3709/// come from the 1993 ASA Statistical Graphics Exposition, and are
3710/// taken from the mandatory F&DA food label. The data have been
3711/// normalized here to a portion of one American cup.
3712///
3713/// ## Usage:
3714///
3715/// UScereal
3716///
3717/// ## Format:
3718///
3719/// This data frame contains the following columns:
3720///
3721/// * ‘mfr’ Manufacturer, represented by its first initial: G=General
3722/// Mills, K=Kelloggs, N=Nabisco, P=Post, Q=Quaker Oats,
3723/// R=Ralston Purina.
3724/// * ‘calories’ number of calories in one portion.
3725/// * ‘protein’ grams of protein in one portion.
3726/// * ‘fat’ grams of fat in one portion.
3727/// * ‘sodium’ milligrams of sodium in one portion.
3728/// * ‘fibre’ grams of dietary fibre in one portion.
3729/// * ‘carbo’ grams of complex carbohydrates in one portion.
3730/// * ‘sugars’ grams of sugars in one portion.
3731/// * ‘shelf’ display shelf (1, 2, or 3, counting from the floor).
3732/// * ‘potassium’ grams of potassium.
3733/// * ‘vitamins’ vitamins and minerals (none, enriched, or 100%).
3734///
3735/// ## Source:
3736///
3737/// The original data are available at
3738///<http://lib.stat.cmu.edu/datasets/1993.expo/>.
3739///
3740/// ## References:
3741///
3742/// Venables, W. N. and Ripley, B. D. (2002) _Modern Applied
3743/// Statistics with S-PLUS._ Fourth Edition. Springer.
3744pub fn uscereal() -> PolarsResult<DataFrame> {
3745 CsvReader::new(Cursor::new(include_str!("UScereal.csv"))).finish()
3746}
3747
3748/// # The Effect of Punishment Regimes on Crime Rates
3749///
3750/// ## Description:
3751///
3752/// Criminologists are interested in the effect of punishment regimes
3753/// on crime rates. This has been studied using aggregate data on 47
3754/// states of the USA for 1960 given in this data frame. The
3755/// variables seem to have been re-scaled to convenient numbers.
3756///
3757/// ## Usage:
3758///
3759/// UScrime
3760///
3761/// ## Format:
3762///
3763/// This data frame contains the following columns:
3764///
3765/// * ‘M’ percentage of males aged 14-24.
3766/// * ‘So’ indicator variable for a Southern state.
3767/// * ‘Ed’ mean years of schooling.
3768/// * ‘Po1’ police expenditure in 1960.
3769/// * ‘Po2’ police expenditure in 1959.
3770/// * ‘LF’ labour force participation rate.
3771/// * ‘M.F’ number of males per 1000 females.
3772/// * ‘Pop’ state population.
3773/// * ‘NW’ number of non-whites per 1000 people.
3774/// * ‘U1’ unemployment rate of urban males 14-24.
3775/// * ‘U2’ unemployment rate of urban males 35-39.
3776/// * ‘GDP’ gross domestic product per head.
3777/// * ‘Ineq’ income inequality.
3778/// * ‘Prob’ probability of imprisonment.
3779/// * ‘Time’ average time served in state prisons.
3780/// * ‘y’ rate of crimes in a particular category per head of
3781/// population.
3782///
3783/// ## Source:
3784///
3785/// Ehrlich, I. (1973) Participation in illegitimate activities: a
3786/// theoretical and empirical investigation. _Journal of Political
3787/// Economy_, *81*, 521-565.
3788///
3789/// Vandaele, W. (1978) Participation in illegitimate activities:
3790/// Ehrlich revisited. In _Deterrence and Incapacitation_, eds A.
3791/// Blumstein, J. Cohen and D. Nagin, pp. 270-335. US National
3792/// Academy of Sciences.
3793///
3794/// ## References:
3795///
3796/// Venables, W. N. and Ripley, B. D. (2002) _Modern Applied
3797/// Statistics with S-PLUS._ Fourth Edition. Springer.
3798pub fn uscrime() -> PolarsResult<DataFrame> {
3799 CsvReader::new(Cursor::new(include_str!("UScrime.csv"))).finish()
3800}
3801
3802/// # Veteran's Administration Lung Cancer Trial
3803///
3804/// ## Description:
3805///
3806/// Veteran's Administration lung cancer trial from Kalbfleisch &
3807/// Prentice.
3808///
3809/// ## Usage:
3810///
3811/// VA
3812///
3813/// ## Format:
3814///
3815/// A data frame with columns:
3816///
3817/// * ‘stime’ survival or follow-up time in days.
3818/// * ‘status’ dead or censored.
3819/// * ‘treat’ treatment: standard or test.
3820/// * ‘age’ patient's age in years.
3821/// * ‘Karn’ Karnofsky score of patient's performance on a scale of 0 to
3822/// 100.
3823/// * ‘diag.time’ times since diagnosis in months at entry to trial.
3824/// * ‘cell’ one of four cell types.
3825/// * ‘prior’ prior therapy?
3826///
3827/// ## Source:
3828///
3829/// Kalbfleisch, J.D. and Prentice R.L. (1980) _The Statistical
3830/// Analysis of Failure Time Data._ Wiley.
3831///
3832/// ## References:
3833///
3834/// Venables, W. N. and Ripley, B. D. (2002) _Modern Applied
3835/// Statistics with S._ Fourth edition. Springer.
3836pub fn va() -> PolarsResult<DataFrame> {
3837 CsvReader::new(Cursor::new(include_str!("VA.csv"))).finish()
3838}
3839
3840/// # Counts of Waders at 15 Sites in South Africa
3841///
3842/// ## Description:
3843///
3844/// The ‘waders’ data frame has 15 rows and 19 columns. The entries
3845/// are counts of waders in summer.
3846///
3847/// ## Usage:
3848///
3849/// waders
3850///
3851/// ## Format:
3852///
3853/// This data frame contains the following columns (species)
3854///
3855/// * ‘S1’ Oystercatcher
3856/// * ‘S2’ White-fronted Plover
3857/// * ‘S3’ Kitt Lutz's Plover
3858/// * ‘S4’ Three-banded Plover
3859/// * ‘S5’ Grey Plover
3860/// * ‘S6’ Ringed Plover
3861/// * ‘S7’ Bar-tailed Godwit
3862/// * ‘S8’ Whimbrel
3863/// * ‘S9’ Marsh Sandpiper
3864/// * ‘S10’ Greenshank
3865/// * ‘S11’ Common Sandpiper
3866/// * ‘S12’ Turnstone
3867/// * ‘S13’ Knot
3868/// * ‘S14’ Sanderling
3869/// * ‘S15’ Little Stint
3870/// * ‘S16’ Curlew Sandpiper
3871/// * ‘S17’ Ruff
3872/// * ‘S18’ Avocet
3873/// * ‘S19’ Black-winged Stilt
3874///
3875/// The rows are the sites:
3876///
3877/// * A = Namibia North coast
3878/// * B = Namibia North wetland
3879/// * C = Namibia South coast
3880/// * D = Namibia South wetland
3881/// * E = Cape North coast
3882/// * F = Cape North wetland
3883/// * G = Cape West coast
3884/// * H = Cape West wetland
3885/// * I = Cape South coast
3886/// * J = Cape South wetland
3887/// * K = Cape East coast
3888/// * L = Cape East wetland
3889/// * M = Transkei coast
3890/// * N = Natal coast
3891/// * O = Natal wetland
3892///
3893/// ## Source:
3894///
3895/// J.C. Gower and D.J. Hand (1996) _Biplots_ Chapman & Hall Table
3896/// 9.1. Quoted as from:
3897///
3898/// R.W. Summers, L.G. Underhill, D.J. Pearson and D.A. Scott (1987)
3899/// Wader migration systems in south and eastern Africa and western
3900/// Asia. _Wader Study Group Bulletin_ *49* Supplement, 15-34.
3901///
3902/// ## Examples:
3903///
3904/// plot(corresp(waders, nf=2))
3905pub fn waders() -> PolarsResult<DataFrame> {
3906 CsvReader::new(Cursor::new(include_str!("waders.csv"))).finish()
3907}
3908
3909/// # House Insulation: Whiteside's Data
3910///
3911/// ## Description:
3912///
3913/// Mr Derek Whiteside of the UK Building Research Station recorded
3914/// the weekly gas consumption and average external temperature at his
3915/// own house in south-east England for two heating seasons, one of 26
3916/// weeks before, and one of 30 weeks after cavity-wall insulation was
3917/// installed. The object of the exercise was to assess the effect of
3918/// the insulation on gas consumption.
3919///
3920/// ## Usage:
3921///
3922/// whiteside
3923///
3924/// ## Format:
3925///
3926/// The ‘whiteside’ data frame has 56 rows and 3 columns.:
3927///
3928/// * ‘Insul’ A factor, before or after insulation.
3929/// * ‘Temp’ Purportedly the average outside temperature in degrees
3930/// Celsius. (These values is far too low for any 56-week period
3931/// in the 1960s in South-East England. It might be the weekly
3932/// average of daily minima.)
3933/// * ‘Gas’ The weekly gas consumption in 1000s of cubic feet.
3934///
3935/// ## Source:
3936///
3937/// A data set collected in the 1960s by Mr Derek Whiteside of the UK
3938/// Building Research Station. Reported by
3939///
3940/// Hand, D. J., Daly, F., McConway, K., Lunn, D. and Ostrowski, E.
3941/// eds (1993) _A Handbook of Small Data Sets._ Chapman & Hall, p. 69.
3942///
3943/// ## References:
3944///
3945/// Venables, W. N. and Ripley, B. D. (2002) _Modern Applied
3946/// Statistics with S._ Fourth edition. Springer.
3947///
3948/// ## Examples:
3949///
3950/// ```r
3951/// require(lattice)
3952/// xyplot(Gas ~ Temp | Insul, whiteside, panel =
3953/// function(x, y, ...) {
3954/// panel.xyplot(x, y, ...)
3955/// panel.lmline(x, y, ...)
3956/// }, xlab = "Average external temperature (deg. C)",
3957/// ylab = "Gas consumption (1000 cubic feet)", aspect = "xy",
3958/// strip = function(...) strip.default(..., style = 1))
3959///
3960/// gasB <- lm(Gas ~ Temp, whiteside, subset = Insul=="Before")
3961/// gasA <- update(gasB, subset = Insul=="After")
3962/// summary(gasB)
3963/// summary(gasA)
3964/// gasBA <- lm(Gas ~ Insul/Temp - 1, whiteside)
3965/// summary(gasBA)
3966///
3967/// gasQ <- lm(Gas ~ Insul/(Temp + I(Temp^2)) - 1, whiteside)
3968/// coef(summary(gasQ))
3969///
3970/// gasPR <- lm(Gas ~ Insul + Temp, whiteside)
3971/// anova(gasPR, gasBA)
3972/// options(contrasts = c("contr.treatment", "contr.poly"))
3973/// gasBA1 <- lm(Gas ~ Insul*Temp, whiteside)
3974/// coef(summary(gasBA1))
3975/// ```
3976pub fn whiteside() -> PolarsResult<DataFrame> {
3977 CsvReader::new(Cursor::new(include_str!("whiteside.csv"))).finish()
3978}
3979
3980/// # Weight Loss Data from an Obese Patient
3981///
3982/// # Description:
3983///
3984/// The data frame gives the weight, in kilograms, of an obese patient
3985/// at 52 time points over an 8 month period of a weight
3986/// rehabilitation programme.
3987///
3988/// # Usage:
3989///
3990/// wtloss
3991///
3992/// # Format:
3993///
3994/// This data frame contains the following columns:
3995///
3996/// * ‘Days’ time in days since the start of the programme.
3997/// * ‘Weight’ weight in kilograms of the patient.
3998///
3999/// ## Source:
4000///
4001/// Dr T. Davies, Adelaide.
4002///
4003/// ## References:
4004///
4005/// Venables, W. N. and Ripley, B. D. (2002) _Modern Applied
4006/// Statistics with S._ Fourth edition. Springer.
4007///
4008/// ## Examples:
4009///
4010/// ```r
4011/// ## IGNORE_RDIFF_BEGIN
4012/// wtloss.fm <- nls(Weight ~ b0 + b1*2^(-Days/th),
4013/// data = wtloss, start = list(b0=90, b1=95, th=120))
4014/// wtloss.fm
4015/// ## IGNORE_RDIFF_END
4016/// plot(wtloss)
4017/// with(wtloss, lines(Days, fitted(wtloss.fm)))
4018/// ```
4019pub fn wtloss() -> PolarsResult<DataFrame> {
4020 CsvReader::new(Cursor::new(include_str!("wtloss.csv"))).finish()
4021}