2 min read

The Modularity of Kaggle NCAA Predictions

Setup:

We want to look at all the Kaggle submissions for 2016. First of all, we need to download the data here.

Analysis

Now let's run some R! First, let's load in the data:

library(dplyr)
library(readr)
library(stringr)
library(ggplot2)
library(reshape2)
rm(list=ls())

# Get list of all submission files.
# `pattern` is a regular expression, not a shell glob, so match the literal
# ".csv" extension at the end of the name; "*.csv" is not anchored to the
# extension.
files <- list.files(
  path = "~/Desktop/predictions/",
  pattern = "\\.csv$",
  full.names = TRUE
)

# Load all files, one data frame per submission. Text columns are kept as
# character vectors instead of being converted to factors.
allPredictions <- lapply(
  files,
  function(f) read.csv(f, stringsAsFactors = FALSE)
)

Join all the predictions, remove bad predictions, and calculate the correlation

# Bind the second column of every submission side by side, one column per
# submission (presumably the predicted value in the Kaggle submission
# format -- confirm against the downloaded files).
pred <- data.frame(lapply(allPredictions, function(x) x[, 2]))

# Some predictions were not parsed as numbers (they show up as factor or
# character columns); keep only the numeric ones. A logical mask is used
# rather than `-which(...)`, because negative indexing with an empty
# `which()` result would drop every column.
is_numeric_col <- vapply(pred, is.numeric, logical(1))
pred.clean <- pred[, is_numeric_col, drop = FALSE]

cormat <- round(cor(pred.clean), 2)
## Warning in cor(pred.clean): the standard deviation is zero

# Some correlations result in NA (e.g. a submission with zero standard
# deviation, per the warning above). Such a submission has NA on the
# diagonal too, so test the diagonal instead of the first row: this still
# works when the first submission is itself one of the bad ones, and the
# logical mask is a no-op when nothing needs removing.
has_na <- is.na(diag(cormat))
cormat <- cormat[!has_na, !has_na, drop = FALSE]

Define a distance metric between predictions, and order them by similarity

# Reorder a correlation matrix so that similar variables end up adjacent.
#
# Correlations are turned into distances with (1 - r) / 2, the variables
# are clustered with hclust(), and the resulting leaf order is applied to
# both rows and columns.
#
# Args:
#   cormat: a square correlation matrix.
# Returns:
#   `cormat` with rows and columns permuted into clustering order. A matrix
#   with fewer than two rows is returned unchanged.
reorder_cormat <- function(cormat) {
  # hclust() requires at least two objects; nothing to reorder otherwise.
  if (nrow(cormat) < 2) {
    return(cormat)
  }
  # Use correlation between variables as distance
  dd <- as.dist((1 - cormat) / 2)
  hc <- hclust(dd)
  # Return the reordered matrix visibly (no trailing assignment).
  cormat[hc$order, hc$order]
}

Rename the predictions and melt for plotting

# Put similar predictions next to each other, then replace the submission
# names with their position in that ordering. seq_len() is used instead of
# 1:n so an empty matrix does not produce the sequence c(1, 0).
cormat <- reorder_cormat(cormat)
colnames(cormat) <- seq_len(nrow(cormat))
rownames(cormat) <- seq_len(nrow(cormat))

# Long format for plotting: one row per matrix cell, with the row index,
# column index and correlation value; NA cells are dropped.
melted_cormat <- melt(cormat, na.rm = TRUE)

Create the heatmap

# Create a ggplot2 heatmap of the correlation matrix: blue for negative,
# white for zero and red for positive correlation, on a fixed [-1, 1] scale.
covariance_plot <- ggplot(melted_cormat, aes(Var2, Var1, fill = value)) +
  geom_raster() +
  scale_fill_gradient2(
    low = "blue",
    high = "red",
    mid = "white",
    midpoint = 0,
    # `limits` is spelled out in full rather than relying on partial
    # argument matching.
    limits = c(-1, 1),
    space = "Lab",
    name = "Pearson\nCorrelation"
  ) +
  theme_minimal() +
  theme(
    axis.title.x = element_blank(),
    axis.title.y = element_blank()
  )
covariance_plot