Loading the Necessary Packages (Libraries)

# Pack function: install and load more than one  R packages.
# Check to see if packages are installed. 
# Install them if they are not, 
# Then load them into the R session.

pack <- function(lib){
    new.lib <- lib[!(lib %in% 
                     installed.packages()[, "Package"])]
    if (length(new.lib)) 
        install.packages(new.lib, dependencies = TRUE)
    sapply(lib, require, character.only = TRUE)
}

# usage
packages <- c('astsa', 'xts', 'tidyquant', 'quantmod', 'tidyverse', 'dplyr', 
              'pander', 'fpp2', 'broom', 'caret', 'factoextra', 'corrplot', 
              'e1071', 'rugarch')
pack(packages)
##      astsa        xts  tidyquant   quantmod  tidyverse      dplyr     pander 
##       TRUE       TRUE       TRUE       TRUE       TRUE       TRUE       TRUE 
##       fpp2      broom      caret factoextra   corrplot      e1071    rugarch 
##       TRUE       TRUE       TRUE       TRUE       TRUE       TRUE       TRUE

Preprocessing - Initial Steps

# get (read-in) data for the last 10 years 
start = as.Date("2011-11-30") # start date
end = as.Date("2021-11-30")   # end date

# data might not be available for entirety of date range
# but a 10 year look back is done to accommodate full size and scope
getSymbols(c("LTC-USD"), 
           src = "yahoo", 
           from = start, 
           to = end)
## [1] "LTC-USD"

Exploratory Data Analysis (EDA)

# cast litecoin time series into dataframe
litecoin_df <- data.frame(`LTC-USD`) 
colnames(litecoin_df) <- c("Open", "High", "Low", "Close", "Volume", "Adjusted")

litecoin.ts <- tq_get("LTC-USD", from = "2011-11-30", to = "2021-11-30") %>%
select(adjusted) %>% # adjusted price (more accurate than close price)
ts(.) # turning it into a time series object
ltc_xts <- as.xts(litecoin_df)
str(litecoin_df); str(litecoin.ts)
## 'data.frame':    2632 obs. of  6 variables:
##  $ Open    : num  5.09 5.07 4.69 4.33 4.26 ...
##  $ High    : num  5.17 5.07 4.76 4.62 4.3 ...
##  $ Low     : num  4.97 4.58 4.25 4.2 4.15 ...
##  $ Close   : num  5.06 4.69 4.33 4.29 4.25 ...
##  $ Volume  : num  3071840 4569260 3917450 5490660 2931220 ...
##  $ Adjusted: num  5.06 4.69 4.33 4.29 4.25 ...
##  Time-Series [1:2632, 1] from 1 to 2632: 5.06 4.69 4.33 4.29 4.25 ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : NULL
##   ..$ : chr "adjusted"
cat("Dimensions of dataset:", dim(litecoin_df)) # dimensions of dataset
## Dimensions of dataset: 2632 6
cat("There are", sum(is.na(litecoin_df)), 'missing values in the dataset. \n')
## There are 24 missing values in the dataset.
# list columns pertaining to missing values in dataframe
list_na <- colnames(litecoin_df)[ apply(litecoin_df, 2, anyNA)]; list_na
## [1] "Open"     "High"     "Low"      "Close"    "Volume"   "Adjusted"
# remove missing values
litecoin_df <- litecoin_df[complete.cases(litecoin_df),] 
litecoin.ts <- litecoin.ts[complete.cases(litecoin.ts),] 
ltc_xts <- ltc_xts[complete.cases(ltc_xts),] 
# Check for missing values after complete cases (removal)
cat("\n There are", sum(is.na(litecoin_df)), 'missing values in the dataset.\n',
    'New dimensions of dataset:', dim(litecoin_df))# dimensions of dataset
## 
##  There are 0 missing values in the dataset.
##  New dimensions of dataset: 2628 6

At the time of the analysis, the dataset has 2628 rows and 6 columns of data.

# inspect the first and last few rows of data
head(litecoin_df, 8)
tail(litecoin_df, 8) 

Summary Statistics

summary(litecoin_df[,6]) # summary stats of adjusted close prices
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.157   3.879  46.324  64.075  87.115 386.451

Distributions

# histogram distributions
par(mfrow = c(2,3), mar = c(2, 2, 5, 2))
options(scipen=999)

for (i in 1:ncol(litecoin_df)) {
    hist(litecoin_df[,i],
    xlab = names(litecoin_df[i]), ylim=c(0,1600),
    main = paste(names(litecoin_df[i]), " - Histogram"), 
    col="gray60")
  }

# boxplot distributions
par(mfrow = c(2, 3),
    mar = c(2, 2, 5, 2))

for (i in 1:ncol(litecoin_df)) {
    boxplot(litecoin_df[,i],
    ylab = names(litecoin_df[i]),
    main = paste(names(litecoin_df[i]), "- Boxplot"), horizontal=TRUE,
    col="gray")
  }

The OHLC (open, high, low, close) and adjusted prices exhibit long-tailed distributions with a right skew; so does the volume.

Since we are interested in evaluating Litecoin’s performance as a cryptocurrency, the final (close) price would be intrinsically of interest; but, more importantly, the “adjusted closing price is considered to be a more technically accurate reflection of the true value” (Bischoff, 2019).

# test skewness by looking at mean and median relationship
mean_ltc <- round(apply(litecoin_df, 2, mean, na.rm = T),0)
median_ltc <- round(apply(litecoin_df, 2, median, na.rm = T),0)
distribution<- data.frame(mean_ltc, median_ltc)
distribution$Skewness <- ifelse(mean_ltc > 2 + median_ltc, "skewed", "normal")
distribution
# Check for exact skewness in LTC.Volume
skewValue <- apply(litecoin_df, 2, skewness, na.rm=T)
skewValue
##     Open     High      Low    Close   Volume Adjusted 
## 1.410603 1.453786 1.357228 1.406507 2.299834 1.406507
# Applying Box-Cox Transformation on skewed variable
trans <- preProcess(data.frame(litecoin_df), method=c("BoxCox"))
trans
## Created from 2628 samples and 6 variables
## 
## Pre-processing:
##   - Box-Cox transformation (6)
##   - ignored (0)
## 
## Lambda estimates for Box-Cox transformation:
## 0.2, 0.2, 0.2, 0.2, 0.1, 0.2
# look at and compare to transformed data
transformed <- predict(trans, data.frame(litecoin_df))
skew_transformed <- apply(transformed, 2, skewness, na.rm=T)
skew_transformed
##        Open        High         Low       Close      Volume    Adjusted 
## -0.06181721 -0.05492312 -0.07210481 -0.06325734 -0.38318043 -0.06325734
new_skew <- data.frame(skewValue, skew_transformed)
new_skew$Skew_Variance <- ifelse(skewValue < skew_transformed, "More skewed", 
                                 "Less skewed")
new_skew

Correlation Matrix

# assign correlation function call to variable
cor_ltc <- cor(litecoin_df) 
# plot the correlation table (matrix)
corrplot(cor_ltc,
         method="color",
         col=colorRampPalette(c("yellow",
                                "white",
                                "orange"))(200),
         addCoef.col = "black", 
         tl.col="black", tl.srt=45, type="lower")

From the correlation matrix, it can be discerned that whereas the OHLC and adjusted prices exhibit multicollinearity at \(r=1,\) their relationships with volume is much less pronounced, where \(0.56 \leq r \leq 0.58.\)

Principal Component Analysis (PCA)

# center, scale the data, and assign to PCA variable
litecoin.pca <- prcomp(litecoin_df, center = TRUE, scale. = TRUE)

# assign to variance explained variable 
var_explained <- round(litecoin.pca$sdev^2/sum((litecoin.pca$sdev)^2)*100, 4)
fviz_eig(litecoin.pca, main="Scree Plot of Six Principal Components",
         xlab="Principal Components", 
         ylab = "Percent Variance Explained",
         barcolor = "grey", barfill = "grey", 
         linecolor = "blue", addlabels=T,
         ggtheme=theme_classic())
Principal Component Percent Variance Percent Change (Delta)
1 89.35
2 10.46 78.9
3 0.11 10.35
4 0.07 0.03
5 0.01 0.06
6 0 0.01

Approximately 89.35% of the variance in the data is explained by the first principal component; thus, the effective dimension is 1. This is supported by and demonstrated in the scree plot and the ensuing table above. The table itself numerically demonstrates the percent variance that is explained by each respective principal component. The scree plot visually depicts “the percentage of the total variance explained by each component” (Kuhn & Johnson, 2016, p. 38).

# create new variable for sole purpose of plotting years on x-axis, not indices
litecoin_plot <- ts(as.vector(litecoin.ts), start=c(2014), frequency = 365)
tsplot(litecoin_plot, main='LTC Adjusted Closing Prices (2014 - 2021)',
       xlab='Year', ylab='Adjusted Price (USD)') # plot the time series

The time series shows a clear trend with several predominant peaks and troughs, at approximately 2017 - 2018 and 2020 - 2021, respectively. To mitigate (offset) the trend, differencing will be performed. Furthermore, the autocorrelation function (ACF) and partial autocorrelation function (PACF) are examined. Whereas ACF “measures the linear predictability of the series at time \(t\), say \(x_t\) using only the value of \(x_s\)” (Shumway & Stoffer, 2019, p. 20), the PACF does the same for a truncated lag length.

Autocorrelation Function (ACF) \[\begin{eqnarray*} \rho(s,t) &=& \frac{\gamma(s,t)}{\sqrt{\gamma(s,s)\gamma(t,t)}} \end{eqnarray*}\]

where \(-1 \leq \rho(s,t) \leq 1.\)

For the sample ACF, we have:

\[\begin{eqnarray*} \rho \text{ } x(h) &=& \frac{\gamma(x(h)}{\gamma x(0)} = \frac{(X_{t+h}-\bar{X}) (X_t-\bar{X})}{\large \sum (X_t-\bar{X})^2}\\ \\ &=&\text{Corr}(X_{t+h,}X_t) \end{eqnarray*}\]

par(mfrow=c(2,1), oma = c(2,2,0,0) + 0.1, mar = c(1,4,3,1) + 0.1)
acf(litecoin_df$Adjusted, lag.max=100, main='Litecoin ACF and PACF for Adjusted Prices')
pacf(litecoin_df$Adjusted, lag.max=100, main='', ylab='PACF')

Whereas the ACF gradually tapers off, the PACF cuts off after lag 1, thereby relegating this to an AR(1) model. So we have the following:

arima(litecoin_df$Adjusted, order=c(1, 0, 0))
## 
## Call:
## arima(x = litecoin_df$Adjusted, order = c(1, 0, 0))
## 
## Coefficients:
##          ar1  intercept
##       0.9960    64.0773
## s.e.  0.0018    30.5462
## 
## sigma^2 estimated as 46.5:  log likelihood = -8776.55,  aic = 17559.09

\[\begin{eqnarray*} (x_t-\mu) &=& \phi_1(x_{t-1}-\mu)+\omega_t.\\ (x_t-64.0773) &=& 0.9960(x_{t-1}-64.0773) + \omega_t\\ x_t &=& 64.0773-(64.0773\times 0.9960) + 0.9960x_{t-1}+\omega_t = 0.256+0.9960x_{t-1}+\omega_t \end{eqnarray*}\]

Smoothing and its Effects

We plot the data for the last six years (November 2014 through November 2021).

Next, we smooth the data by introducing the simple moving average (SMA), and exponential moving average (EMA), respectively, weighting the effects by 30 days (one full month).

chartSeries(litecoin_df, theme = chartTheme("white"))

addSMA(30) # smoothed out moving average by 30 days