# Pack function: install and load more than one R packages.
# Check to see if packages are installed.
# Install them if they are not,
# Then load them into the R session.
pack <- function(lib){
new.lib <- lib[!(lib %in%
installed.packages()[, "Package"])]
if (length(new.lib))
install.packages(new.lib, dependencies = TRUE)
sapply(lib, require, character.only = TRUE)
}
# usage
packages <- c('astsa', 'xts', 'tidyquant', 'quantmod', 'tidyverse', 'dplyr',
'pander', 'fpp2', 'broom', 'caret', 'factoextra', 'corrplot',
'e1071', 'rugarch')
pack(packages)
## astsa xts tidyquant quantmod tidyverse dplyr pander
## TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## fpp2 broom caret factoextra corrplot e1071 rugarch
## TRUE TRUE TRUE TRUE TRUE TRUE TRUE
# get (read-in) data for the last 10 years
start = as.Date("2011-11-30") # start date
end = as.Date("2021-11-30") # end date
# data might not be available for entirety of date range
# but a 10 year look back is done to accommodate full size and scope
getSymbols(c("LTC-USD"),
src = "yahoo",
from = start,
to = end)
## [1] "LTC-USD"
# cast litecoin time series into dataframe
litecoin_df <- data.frame(`LTC-USD`)
colnames(litecoin_df) <- c("Open", "High", "Low", "Close", "Volume", "Adjusted")
litecoin.ts <- tq_get("LTC-USD", from = "2011-11-30", to = "2021-11-30") %>%
select(adjusted) %>% # adjusted price (more accurate than close price)
ts(.) # turning it into a time series object
ltc_xts <- as.xts(litecoin_df)
str(litecoin_df); str(litecoin.ts)
## 'data.frame': 2632 obs. of 6 variables:
## $ Open : num 5.09 5.07 4.69 4.33 4.26 ...
## $ High : num 5.17 5.07 4.76 4.62 4.3 ...
## $ Low : num 4.97 4.58 4.25 4.2 4.15 ...
## $ Close : num 5.06 4.69 4.33 4.29 4.25 ...
## $ Volume : num 3071840 4569260 3917450 5490660 2931220 ...
## $ Adjusted: num 5.06 4.69 4.33 4.29 4.25 ...
## Time-Series [1:2632, 1] from 1 to 2632: 5.06 4.69 4.33 4.29 4.25 ...
## - attr(*, "dimnames")=List of 2
## ..$ : NULL
## ..$ : chr "adjusted"
cat("Dimensions of dataset:", dim(litecoin_df)) # dimensions of dataset
## Dimensions of dataset: 2632 6
cat("There are", sum(is.na(litecoin_df)), 'missing values in the dataset. \n')
## There are 24 missing values in the dataset.
# list columns pertaining to missing values in dataframe
list_na <- colnames(litecoin_df)[ apply(litecoin_df, 2, anyNA)]; list_na
## [1] "Open" "High" "Low" "Close" "Volume" "Adjusted"
# remove missing values
litecoin_df <- litecoin_df[complete.cases(litecoin_df),]
litecoin.ts <- litecoin.ts[complete.cases(litecoin.ts),]
ltc_xts <- ltc_xts[complete.cases(ltc_xts),]
# Check for missing values after complete cases (removal)
cat("\n There are", sum(is.na(litecoin_df)), 'missing values in the dataset.\n',
'New dimensions of dataset:', dim(litecoin_df))# dimensions of dataset
##
## There are 0 missing values in the dataset.
## New dimensions of dataset: 2628 6
At the time of the analysis, the dataset has 2628 rows and 6 columns of data.
# inspect the first and last few rows of data
head(litecoin_df, 8)
tail(litecoin_df, 8)
summary(litecoin_df[,6]) # summary stats of adjusted close prices
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.157 3.879 46.324 64.075 87.115 386.451
# histogram distributions
par(mfrow = c(2,3), mar = c(2, 2, 5, 2))
options(scipen=999)
for (i in 1:ncol(litecoin_df)) {
hist(litecoin_df[,i],
xlab = names(litecoin_df[i]), ylim=c(0,1600),
main = paste(names(litecoin_df[i]), " - Histogram"),
col="gray60")
}
# boxplot distributions
par(mfrow = c(2, 3),
mar = c(2, 2, 5, 2))
for (i in 1:ncol(litecoin_df)) {
boxplot(litecoin_df[,i],
ylab = names(litecoin_df[i]),
main = paste(names(litecoin_df[i]), "- Boxplot"), horizontal=TRUE,
col="gray")
}
The OHLC (open, high, low, close) and adjusted prices exhibit long-tailed distributions with a right skew; so does the volume.
Since we are interested in evaluating Litecoin’s performance as a cryptocurrency, the final (close) price would be intrinsically of interest; but, more importantly, the “adjusted closing price is considered to be a more technically accurate reflection of the true value” (Bischoff, 2019).
# test skewness by looking at mean and median relationship
mean_ltc <- round(apply(litecoin_df, 2, mean, na.rm = T),0)
median_ltc <- round(apply(litecoin_df, 2, median, na.rm = T),0)
distribution<- data.frame(mean_ltc, median_ltc)
distribution$Skewness <- ifelse(mean_ltc > 2 + median_ltc, "skewed", "normal")
distribution
# Check for exact skewness in LTC.Volume
skewValue <- apply(litecoin_df, 2, skewness, na.rm=T)
skewValue
## Open High Low Close Volume Adjusted
## 1.410603 1.453786 1.357228 1.406507 2.299834 1.406507
# Applying Box-Cox Transformation on skewed variable
trans <- preProcess(data.frame(litecoin_df), method=c("BoxCox"))
trans
## Created from 2628 samples and 6 variables
##
## Pre-processing:
## - Box-Cox transformation (6)
## - ignored (0)
##
## Lambda estimates for Box-Cox transformation:
## 0.2, 0.2, 0.2, 0.2, 0.1, 0.2
# look at and compare to transformed data
transformed <- predict(trans, data.frame(litecoin_df))
skew_transformed <- apply(transformed, 2, skewness, na.rm=T)
skew_transformed
## Open High Low Close Volume Adjusted
## -0.06181721 -0.05492312 -0.07210481 -0.06325734 -0.38318043 -0.06325734
new_skew <- data.frame(skewValue, skew_transformed)
new_skew$Skew_Variance <- ifelse(skewValue < skew_transformed, "More skewed",
"Less skewed")
new_skew
# assign correlation function call to variable
cor_ltc <- cor(litecoin_df)
# plot the correlation table (matrix)
corrplot(cor_ltc,
method="color",
col=colorRampPalette(c("yellow",
"white",
"orange"))(200),
addCoef.col = "black",
tl.col="black", tl.srt=45, type="lower")
From the correlation matrix, it can be discerned that whereas the OHLC and adjusted prices exhibit multicollinearity at \(r=1,\) their relationships with volume is much less pronounced, where \(0.56 \leq r \leq 0.58.\)
# center, scale the data, and assign to PCA variable
litecoin.pca <- prcomp(litecoin_df, center = TRUE, scale. = TRUE)
# assign to variance explained variable
var_explained <- round(litecoin.pca$sdev^2/sum((litecoin.pca$sdev)^2)*100, 4)
fviz_eig(litecoin.pca, main="Scree Plot of Six Principal Components",
xlab="Principal Components",
ylab = "Percent Variance Explained",
barcolor = "grey", barfill = "grey",
linecolor = "blue", addlabels=T,
ggtheme=theme_classic())
Principal Component | Percent Variance | Percent Change (Delta) |
---|---|---|
1 | 89.35 | |
2 | 10.46 | 78.9 |
3 | 0.11 | 10.35 |
4 | 0.07 | 0.03 |
5 | 0.01 | 0.06 |
6 | 0 | 0.01 |
Approximately 89.35% of the variance in the data is explained by the first principal component; thus, the effective dimension is 1. This is supported by and demonstrated in the scree plot and the ensuing table above. The table itself numerically demonstrates the percent variance that is explained by each respective principal component. The scree plot visually depicts “the percentage of the total variance explained by each component” (Kuhn & Johnson, 2016, p. 38).
# create new variable for sole purpose of plotting years on x-axis, not indices
litecoin_plot <- ts(as.vector(litecoin.ts), start=c(2014), frequency = 365)
tsplot(litecoin_plot, main='LTC Adjusted Closing Prices (2014 - 2021)',
xlab='Year', ylab='Adjusted Price (USD)') # plot the time series
The time series shows a clear trend with several predominant peaks and troughs, at approximately 2017 - 2018 and 2020 - 2021, respectively. To mitigate (offset) the trend, differencing will be performed. Furthermore, the autocorrelation function (ACF) and partial autocorrelation function (PACF) are examined. Whereas ACF “measures the linear predictability of the series at time \(t\), say \(x_t\) using only the value of \(x_s\)” (Shumway & Stoffer, 2019, p. 20), the PACF does the same for a truncated lag length.
Autocorrelation Function (ACF) \[\begin{eqnarray*} \rho(s,t) &=& \frac{\gamma(s,t)}{\sqrt{\gamma(s,s)\gamma(t,t)}} \end{eqnarray*}\]
where \(-1 \leq \rho(s,t) \leq 1.\)
For the sample ACF, we have:
\[\begin{eqnarray*} \rho \text{ } x(h) &=& \frac{\gamma(x(h)}{\gamma x(0)} = \frac{(X_{t+h}-\bar{X}) (X_t-\bar{X})}{\large \sum (X_t-\bar{X})^2}\\ \\ &=&\text{Corr}(X_{t+h,}X_t) \end{eqnarray*}\]
par(mfrow=c(2,1), oma = c(2,2,0,0) + 0.1, mar = c(1,4,3,1) + 0.1)
acf(litecoin_df$Adjusted, lag.max=100, main='Litecoin ACF and PACF for Adjusted Prices')
pacf(litecoin_df$Adjusted, lag.max=100, main='', ylab='PACF')
Whereas the ACF gradually tapers off, the PACF cuts off after lag 1, thereby relegating this to an AR(1) model. So we have the following:
arima(litecoin_df$Adjusted, order=c(1, 0, 0))
##
## Call:
## arima(x = litecoin_df$Adjusted, order = c(1, 0, 0))
##
## Coefficients:
## ar1 intercept
## 0.9960 64.0773
## s.e. 0.0018 30.5462
##
## sigma^2 estimated as 46.5: log likelihood = -8776.55, aic = 17559.09
\[\begin{eqnarray*} (x_t-\mu) &=& \phi_1(x_{t-1}-\mu)+\omega_t.\\ (x_t-64.0773) &=& 0.9960(x_{t-1}-64.0773) + \omega_t\\ x_t &=& 64.0773-(64.0773\times 0.9960) + 0.9960x_{t-1}+\omega_t = 0.256+0.9960x_{t-1}+\omega_t \end{eqnarray*}\]
We plot the data for the last six years (November 2014 through November 2021).
Next, we smooth the data by introducing the simple moving average (SMA), and exponential moving average (EMA), respectively, weighting the effects by 30 days (one full month).
chartSeries(litecoin_df, theme = chartTheme("white"))
addSMA(30) # smoothed out moving average by 30 days