Install packages

Install dataset package:

# NOTE: per the warnings printed below, 'datasets' is a base package, so this
# call does not install or update anything
install.packages("datasets") 
## Installing package into '/home/dp/R/x86_64-pc-linux-gnu-library/3.5'
## (as 'lib' is unspecified)
## Warning: package 'datasets' is not available (for R version 3.5.3)
## Warning: package 'datasets' is a base package, and should not be updated
# open the interactive help (note: this works only in an IDE)
?datasets # short version
library(help = "datasets") # or complete list of datasets

Loading / attaching package:

library(datasets)

Create data frame

Create new data frame:

# Build a ten-row demo data frame: a running index, the odd numbers from 1
# to 19, and the current timestamp repeated in every row.
df <- data.frame(
  col1 = seq_len(10),
  col2 = seq(from = 1, to = 20, by = 2),
  col3 = rep(Sys.time(), 10)
)

# Show the result
print(df)

…or load an existing data set:

library(datasets)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Take the EuStockMarkets time series and convert it to a tibble in one step
stocks <- as_tibble(EuStockMarkets)

# Show the converted data
print(stocks)

Explore data frame structure

View data:

# First rows, written as nested calls (read inside-out)
as_tibble(head(stocks))
# Last rows, written as a pipeline (read left to right)
stocks %>% tail() %>% as_tibble()

Data slices:

# First row, all columns
as_tibble(stocks[1, ])
# Rows 10 through 20, all columns
as_tibble(stocks[10:20, ])
# Rows 10 through 20, first and third columns only
as_tibble(stocks[10:20, c(1, 3)])

Data frame metadata:

# Number of rows and columns
print(dim(stocks))
## [1] 1860    4
# Column types and a preview of the values. str() prints its report itself
# and returns NULL invisibly, so it is called directly; wrapping it in
# print() would append a stray "NULL" line to the output.
str(stocks)
## Classes 'tbl_df', 'tbl' and 'data.frame':    1860 obs. of  4 variables:
##  $ DAX : num  1629 1614 1607 1621 1618 ...
##  $ SMI : num  1678 1688 1679 1684 1687 ...
##  $ CAC : num  1773 1750 1718 1708 1723 ...
##  $ FTSE: num  2444 2460 2448 2470 2485 ...
# Per-column summary (min, quartiles, mean, max)
print(summary(stocks))
##       DAX            SMI            CAC            FTSE     
##  Min.   :1402   Min.   :1587   Min.   :1611   Min.   :2281  
##  1st Qu.:1744   1st Qu.:2166   1st Qu.:1875   1st Qu.:2843  
##  Median :2141   Median :2796   Median :1992   Median :3247  
##  Mean   :2531   Mean   :3376   Mean   :2228   Mean   :3566  
##  3rd Qu.:2722   3rd Qu.:3812   3rd Qu.:2274   3rd Qu.:3994  
##  Max.   :6186   Max.   :8412   Max.   :4388   Max.   :6179
# Summary statistics from skimr, returned as a list of tables (see the
# $numeric element in the output below).
# NOTE(review): skim_to_list() looks like part of the older skimr API -
# confirm that the installed skimr version still provides it.
skimr::skim_to_list(stocks)
## $numeric
## # A tibble: 4 x 12
##   variable missing complete n     mean    sd       p0       p25      p50      p75    p100    hist   
## * <chr>    <chr>   <chr>    <chr> <chr>   <chr>    <chr>    <chr>    <chr>    <chr>  <chr>   <chr>  
## 1 CAC      0       1860     1860  2227.83 " 580.3… "1611  … 1875.15  "1992.3… 2274.… "4388.… ▇▆▁▂▁▁…
## 2 DAX      0       1860     1860  2530.66 1084.79  1402.34  "1744.1… 2140.56  2722.… 6186.09 ▇▇▁▁▂▁…
## 3 FTSE     0       1860     1860  3565.64 " 976.7… "2281  … 2843.15  "3246.6… 3993.… "6179 … ▆▇▅▃▂▂…
## 4 SMI      0       1860     1860  3376.22 1663.03  "1587.4… 2165.62  2796.35  3812.… "8412 … ▇▇▃▁▂▁…

Descriptive statistics in R

Statistics types:

  • Center: mean(), median()
  • Spread: sd(), var(), IQR(), mad()
  • Range: min(), max(), quantile()
  • Position: first(), last(), nth()
  • Count: n(), n_distinct()
# Smallest DAX value
min(stocks[["DAX"]])
## [1] 1402.34
# Average CAC value
mean(stocks[["CAC"]])
## [1] 2227.828
# Largest FTSE value
max(stocks[["FTSE"]])
## [1] 6179
library(tidyr)

# Summary functions applied to every index. The list elements are named so
# that the result columns are labelled min, mean, median, ... instead of
# receiving auto-generated names.
funs <- list(
  min = min,
  mean = mean,
  median = median,
  sd = sd,
  var = var,
  IQR = IQR,
  max = max
)

# Reshape to long form (one row per index/close pair), then compute every
# statistic separately for each index.
stocks %>%
  gather(stock, close) %>%
  group_by(stock) %>%
  summarise_all(funs) %>%
  as_tibble()

See also: https://www.statsandr.com/blog/descriptive-statistics-in-r/

Visualization

library(ggplot2)

# Long format for plotting: one row per (stock, close) pair
df <- gather(stocks, key = stock, value = close)

Histogram

# Base-graphics histogram of the DAX column
hist(stocks$DAX)

Boxplot

# Close values per stock. The formula names the columns and `data` supplies
# the data frame, so no `df$` prefix ends up in deparsed labels.
boxplot(close ~ stock, data = df)

# Same comparison on a log scale
boxplot(log(close) ~ stock, data = df)

Scatterplot

# Scatterplot with DAX on the x-axis and SMI on the y-axis
plot(stocks$DAX, stocks$SMI)

Correlation plot

library(corrplot)
## corrplot 0.84 loaded
# Synthetic series: DAX minus Gaussian noise (mean 1000, sd 2000). No seed is
# set, so the correlations involving FAKE differ from run to run.
fake_trades <- stocks$DAX - rnorm(n = nrow(stocks), mean = 1000, sd = 2000)

# Correlation matrix of the four indices plus the synthetic FAKE column
M <- cor(mutate(stocks, FAKE = fake_trades))
print(M)
##            DAX       SMI       CAC      FTSE      FAKE
## DAX  1.0000000 0.9911539 0.9662274 0.9751778 0.4754890
## SMI  0.9911539 1.0000000 0.9468139 0.9899691 0.4734703
## CAC  0.9662274 0.9468139 1.0000000 0.9157265 0.4462341
## FTSE 0.9751778 0.9899691 0.9157265 1.0000000 0.4678175
## FAKE 0.4754890 0.4734703 0.4462341 0.4678175 1.0000000
# Correlation plot with the default settings
corrplot(M)

# Customised version: square glyphs, variables ordered by hierarchical
# clustering, diagonal hidden, small black labels, a title, and custom margins.
corrplot(
  M,
  method = "square",
  order = "hclust",
  diag = FALSE, # spelled out: `F` is an ordinary variable and can be reassigned
  tl.col = "black",
  tl.cex = 0.7,
  title = "Stocks correlation matrix",
  mar = c(0, 1, 2, 0)
)

Density plot

# Density of the DAX level
plot(density(stocks$DAX))

# Density of the day-to-day DAX change. diff() returns x[i] - x[i - 1]
# directly, so the result does not depend on which lag() (dplyr's or stats')
# is currently first on the search path, and there is no leading NA to drop.
plot(density(diff(stocks$DAX)))