Install the `datasets` package:
# NOTE: `datasets` is a base package bundled with R, so this call does not
# install anything; it only triggers the warnings shown below.
install.packages("datasets")
## Installing package into '/home/dp/R/x86_64-pc-linux-gnu-library/3.5'
## (as 'lib' is unspecified)
## Warning: package 'datasets' is not available (for R version 3.5.3)
## Warning: package 'datasets' is a base package, and should not be updated
# Invoke the interactive help (warning: this works only in an IDE)
?datasets # short version
library(help = "datasets") # or the complete list of datasets
Loading / attaching package:
library(datasets)
Create new data frame:
# Build a 10-row demo data frame: an integer index, the odd numbers 1..19,
# and the current timestamp repeated on every row.
df <- data.frame(
  col1 = seq_len(10),
  col2 = seq(from = 1, to = 20, by = 2),
  col3 = rep(Sys.time(), times = 10)
)
# Auto-print the result.
df
…or load an existing data set:
library(datasets)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Take the built-in EuStockMarkets time series and convert it straight to a
# tibble, then auto-print it.
stocks <- EuStockMarkets %>% as_tibble()
stocks
View data:
# Nested-call style: the innermost call, head(), runs first
as_tibble(head(stocks))
# ...or pipe style: read left to right, tail() runs first
stocks %>% tail %>% as_tibble
Data slices:
# First row, all columns
as_tibble(stocks[1, ])
# Rows 10 through 20, all columns
as_tibble(stocks[10:20, ])
# Rows 10 through 20, first and third columns only
as_tibble(stocks[10:20, c(1, 3)])
Data frame metadata:
# Dimensions of the table as c(rows, columns).
print(dim(stocks))
## [1] 1860 4
# str() prints the structure itself and returns NULL invisibly, so it is
# called directly: wrapping it in print() would emit an extra "NULL" line.
str(stocks)
## Classes 'tbl_df', 'tbl' and 'data.frame': 1860 obs. of 4 variables:
## $ DAX : num 1629 1614 1607 1621 1618 ...
## $ SMI : num 1678 1688 1679 1684 1687 ...
## $ CAC : num 1773 1750 1718 1708 1723 ...
## $ FTSE: num 2444 2460 2448 2470 2485 ...
# Per-column summary: minimum, quartiles, median, mean and maximum
print(summary(stocks))
## DAX SMI CAC FTSE
## Min. :1402 Min. :1587 Min. :1611 Min. :2281
## 1st Qu.:1744 1st Qu.:2166 1st Qu.:1875 1st Qu.:2843
## Median :2141 Median :2796 Median :1992 Median :3247
## Mean :2531 Mean :3376 Mean :2228 Mean :3566
## 3rd Qu.:2722 3rd Qu.:3812 3rd Qu.:2274 3rd Qu.:3994
## Max. :6186 Max. :8412 Max. :4388 Max. :6179
# Per-column summary statistics with an inline histogram; the output below
# shows the result as a list holding a `$numeric` tibble.
# NOTE(review): skim_to_list() looks like the skimr 1.x API — confirm it
# exists in the installed skimr version.
skimr::skim_to_list(stocks)
## $numeric
## # A tibble: 4 x 12
## variable missing complete n mean sd p0 p25 p50 p75 p100 hist
## * <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 CAC 0 1860 1860 2227.83 " 580.3… "1611 … 1875.15 "1992.3… 2274.… "4388.… ▇▆▁▂▁▁…
## 2 DAX 0 1860 1860 2530.66 1084.79 1402.34 "1744.1… 2140.56 2722.… 6186.09 ▇▇▁▁▂▁…
## 3 FTSE 0 1860 1860 3565.64 " 976.7… "2281 … 2843.15 "3246.6… 3993.… "6179 … ▆▇▅▃▂▂…
## 4 SMI 0 1860 1860 3376.22 1663.03 "1587.4… 2165.62 2796.35 3812.… "8412 … ▇▇▃▁▂▁…
Statistics types:
mean(), median()
sd(), var(), IQR(), mad()
min(), max(), quantile()
first(), last(), nth()
n(), n_distinct()
# Smallest DAX value
min(stocks[["DAX"]])
## [1] 1402.34
# Average CAC value
mean(stocks[["CAC"]])
## [1] 2227.828
# Largest FTSE value
max(stocks[["FTSE"]])
## [1] 6179
library(tidyr)
# Summary functions to apply to each index. The list is named so that the
# resulting summary columns are labelled after the statistic they hold
# rather than receiving auto-generated names.
funs <- list(
  min = min,
  mean = mean,
  median = median,
  sd = sd,
  var = var,
  IQR = IQR,
  max = max
)
# Reshape to long form (one row per stock/close pair), then compute every
# statistic separately for each stock.
stocks %>%
  gather(stock, close) %>%
  group_by(stock) %>%
  summarise_all(funs) %>%
  as_tibble()
See also: https://www.statsandr.com/blog/descriptive-statistics-in-r/
library(ggplot2)
# Long format: one row per (stock, close) pair.
# NOTE: this overwrites the demo `df` created earlier in the file.
df <- stocks %>% gather(stock, close)
# Histogram of the DAX values
hist(stocks$DAX)
# Box plot of the close values, one box per stock
boxplot(df$close ~ df$stock)
# Same box plot on the log of the close values
boxplot(log(df$close) ~ df$stock)
# Scatter plot of DAX (x axis) against SMI (y axis)
plot(stocks$DAX, stocks$SMI)
library(corrplot)
## corrplot 0.84 loaded
# Synthetic series: DAX minus Gaussian noise (mean 1000, sd 2000). The draw is
# not seeded, so the FAKE correlations differ from run to run.
fake_trades <- stocks$DAX - rnorm(n = nrow(stocks), mean = 1000, sd = 2000)
# Correlation matrix of the four indices plus the synthetic FAKE column.
M <- stocks %>%
  mutate(FAKE = fake_trades) %>%
  cor()
print(M)
## DAX SMI CAC FTSE FAKE
## DAX 1.0000000 0.9911539 0.9662274 0.9751778 0.4754890
## SMI 0.9911539 1.0000000 0.9468139 0.9899691 0.4734703
## CAC 0.9662274 0.9468139 1.0000000 0.9157265 0.4462341
## FTSE 0.9751778 0.9899691 0.9157265 1.0000000 0.4678175
## FAKE 0.4754890 0.4734703 0.4462341 0.4678175 1.0000000
# Correlation plot with corrplot's default settings.
corrplot(M)
# Customised correlation plot of the same matrix.
corrplot(
  M,
  method = "square",
  order = "hclust",
  # Spelled out as FALSE: `F` is an ordinary variable that can be reassigned.
  diag = FALSE,
  tl.col = "black",
  tl.cex = 0.7,
  title = "Stocks correlation matrix",
  mar = c(0, 1, 2, 0)
)
# Kernel density estimate of the DAX values
plot(density(stocks$DAX))
# Density of the change between consecutive DAX observations.
# NOTE: lag() resolves to dplyr::lag here, because attaching dplyr masks
# stats::lag (see the attach message earlier in the file); na.omit() drops
# the NA produced for the first observation.
plot(density(na.omit(stocks$DAX - lag(stocks$DAX))))