Data Frames and Exploratory Data Analysis

Install packages

Install the datasets package (it is a base package bundled with R, so this step is normally unnecessary):

# 'datasets' is a base package that ships with every R installation, so it
# cannot (and need not) be installed from CRAN: an unconditional
# install.packages("datasets") only produces "not available" / "is a base
# package" warnings. Attempt the install only if the package is truly missing.
if (!requireNamespace("datasets", quietly = TRUE)) {
  install.packages("datasets")
}

# Open the interactive help (note: this works only in an IDE)
?datasets # short version
library(help = "datasets") # or the complete list of datasets

Loading / attaching package:

library(datasets)

Create data frame

Create a new data frame:

# Demo data frame with ten rows:
#   col1 - integer row index 1..10
#   col2 - odd numbers from 1 up to (at most) 20
#   col3 - the current timestamp, repeated in every row
df <- data.frame(
  col1 = seq_len(10),
  col2 = seq(from = 1, to = 20, by = 2),
  col3 = rep(Sys.time(), 10)
)

df

…or load an existing data frame:

library(datasets)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Take the built-in EuStockMarkets time series and convert it to a tibble
# in a single pipeline
stocks <- EuStockMarkets %>%
  as_tibble()

stocks

Discover data frame structure

View data:

# Nested-call style: show the first rows of the table
as_tibble(head(stocks))

# ...or pipe style: show the last rows of the table
stocks %>% tail %>% as_tibble

Data slices:

# First row, all columns
stocks[1, ] %>% as_tibble

# Rows 10 through 20, all columns
stocks[10:20, ] %>% as_tibble

# Rows 10 through 20, first and third columns only
stocks[10:20, c(1, 3)] %>% as_tibble

Data frame metadata:

print(dim(stocks))

## [1] 1860    4

# str() prints the structure itself and returns NULL, so it must not be
# wrapped in print(): doing so emits a stray "NULL" after the listing.
str(stocks)

## Classes 'tbl_df', 'tbl' and 'data.frame':    1860 obs. of  4 variables:
##  $ DAX : num  1629 1614 1607 1621 1618 ...
##  $ SMI : num  1678 1688 1679 1684 1687 ...
##  $ CAC : num  1773 1750 1718 1708 1723 ...
##  $ FTSE: num  2444 2460 2448 2470 2485 ...

print(summary(stocks))

##       DAX            SMI            CAC            FTSE     
##  Min.   :1402   Min.   :1587   Min.   :1611   Min.   :2281  
##  1st Qu.:1744   1st Qu.:2166   1st Qu.:1875   1st Qu.:2843  
##  Median :2141   Median :2796   Median :1992   Median :3247  
##  Mean   :2531   Mean   :3376   Mean   :2228   Mean   :3566  
##  3rd Qu.:2722   3rd Qu.:3812   3rd Qu.:2274   3rd Qu.:3994  
##  Max.   :6186   Max.   :8412   Max.   :4388   Max.   :6179

# Summary statistics grouped by column type, returned as a list of tibbles
# NOTE(review): skim_to_list() appears to be deprecated in skimr >= 2.0 in
# favour of partition(skim(...)) - confirm against the installed version
skimr::skim_to_list(stocks)

## $numeric
## # A tibble: 4 x 12
##   variable missing complete n     mean    sd       p0       p25      p50      p75    p100    hist   
## * <chr>    <chr>   <chr>    <chr> <chr>   <chr>    <chr>    <chr>    <chr>    <chr>  <chr>   <chr>  
## 1 CAC      0       1860     1860  2227.83 " 580.3… "1611  … 1875.15  "1992.3… 2274.… "4388.… ▇▆▁▂▁▁…
## 2 DAX      0       1860     1860  2530.66 1084.79  1402.34  "1744.1… 2140.56  2722.… 6186.09 ▇▇▁▁▂▁…
## 3 FTSE     0       1860     1860  3565.64 " 976.7… "2281  … 2843.15  "3246.6… 3993.… "6179 … ▆▇▅▃▂▂…
## 4 SMI      0       1860     1860  3376.22 1663.03  "1587.4… 2165.62  2796.35  3812.… "8412 … ▇▇▃▁▂▁…

Descriptive statistics in R

Statistics types:

Center: mean(), median()
Spread: sd(), var(), IQR(), mad()
Range: min(), max(), quantile()
Position: first(), last(), nth()
Count: n(), n_distinct()

min(stocks$DAX)

## [1] 1402.34

mean(stocks$CAC)

## [1] 2227.828

max(stocks$FTSE)

## [1] 6179

library(tidyr)

# Summary statistics to compute for every stock. The list is named so that
# the result columns are labelled by statistic rather than by auto-generated
# positional names.
funs <- list(
  min = min,
  mean = mean,
  median = median,
  sd = sd,
  var = var,
  IQR = IQR,
  max = max
)

# Reshape to long format (one row per stock/close pair), then apply every
# statistic above to the closing values of each stock.
stocks %>%
  gather(stock, close) %>%
  group_by(stock) %>%
  summarise_all(funs) %>%
  as_tibble()

Visualization

library(ggplot2)

df <- stocks %>% gather(stock, close)

Histogram

hist(stocks$DAX)

Boxplot

# Distribution of closing values per stock. The formula refers to columns by
# name and takes the data frame via `data =`, which keeps the axis labels
# clean ("stock" / "close" instead of "df$stock" / "df$close").
boxplot(close ~ stock, data = df)

# Same comparison on a log scale
boxplot(log(close) ~ stock, data = df)

Scatterplot

plot(stocks$DAX, stocks$SMI)

Correlation plot

library(corrplot)

## corrplot 0.84 loaded

# Synthetic series: DAX minus Gaussian noise (mean 1000, sd 2000). The noise
# is drawn without a fixed seed, so the FAKE correlations differ on every run.
fake_trades <- stocks$DAX - rnorm(n = nrow(stocks), mean = 1000, sd = 2000)

# Correlation matrix of the stock columns plus the synthetic FAKE column
M <- stocks %>%
  mutate(FAKE = fake_trades) %>%
  cor()
print(M)

##            DAX       SMI       CAC      FTSE      FAKE
## DAX  1.0000000 0.9911539 0.9662274 0.9751778 0.4754890
## SMI  0.9911539 1.0000000 0.9468139 0.9899691 0.4734703
## CAC  0.9662274 0.9468139 1.0000000 0.9157265 0.4462341
## FTSE 0.9751778 0.9899691 0.9157265 1.0000000 0.4678175
## FAKE 0.4754890 0.4734703 0.4462341 0.4678175 1.0000000

corrplot(M)

# Styled correlation plot: square glyphs, variables ordered by hierarchical
# clustering, diagonal hidden. FALSE is spelled out because F is an ordinary
# variable that can be reassigned.
corrplot(
  M,
  method = "square",
  order = "hclust",
  diag = FALSE,
  tl.col = "black",
  tl.cex = 0.7,
  title = "Stocks correlation matrix",
  mar = c(0, 1, 2, 0)
)

Density plot

plot(density(stocks$DAX))

# Density of the successive changes in DAX. diff() yields the same values as
# x - lag(x) with the leading NA dropped, but does not rely on dplyr::lag()
# masking stats::lag().
plot(density(diff(stocks$DAX)))

Conclusion

Back to Course program

Data Frames and Exploratory Data Analysis

Dmitry Petukhov

Install packages

Create data frame

Discover data frame structure

Descriptive statistics in R

Visualization

Histogram

Boxplot

Scatterplot

Correlation plot

Density plot

Conclusion