Tags

, , , , , , ,

In this tutorial I use R to prepare a dataset, in several different formats, that can be used for various predictive-analytics purposes.

1. Loading Essential Libraries

# Load the library
library(magrittr)
library(rvest)
library(ggplot2)
library(gtable)
library(grid)
library(taRifx)
library(xtable)
library(pander)
library(stringr)
library(plyr)

2. Setting Up the Data Frame

# Page that holds the England v Wales results tables
URL <- "http://en.wikipedia.org/wiki/History_of_rugby_union_matches_between_England_and_Wales"

# Download and parse the page.
# read_html() is the replacement for rvest's deprecated html().
rugbyHTML <- read_html(URL)

# Take the third table with class "wikitable" on the page and convert it
# into a data frame
rugbyData <- rugbyHTML %>%
  html_nodes("table.wikitable") %>%
  .[[3]] %>%
  html_table()

# Viewing data
head(rugbyData)
##   No.             Date                       Venue   Score  Winner
## 1 126  6 February 2015 Millennium Stadium, Cardiff 16 – 21 England
## 2 125     9 March 2014  Twickenham Stadium, London 29 – 18 England
## 3 124    16 March 2013 Millennium Stadium, Cardiff  30 – 3   Wales
## 4 123 25 February 2012  Twickenham Stadium, London 12 – 19   Wales
## 5 122   13 August 2011 Millennium Stadium, Cardiff  19 – 9   Wales
## 6 121    6 August 2011          Twickenham, London 23 – 19 England
##                         Competition Match report
## 1                  2015 Six Nations             
## 2                  2014 Six Nations             
## 3                  2013 Six Nations          BBC
## 4                  2012 Six Nations          BBC
## 5 2011 Rugby World Cup warm up test          BBC
## 6 2011 Rugby World Cup warm up test          BBC
# Parse the text dates (e.g. "6 February 2015") into Date objects
rugbyData$Date <- as.Date(rugbyData$Date, format = "%d %b %Y")

# Drop the columns that are irrelevant for this tutorial (positions 1, 6, 7)
rugbyData <- rugbyData[-c(1, 6, 7)]

# The first row is the just-announced 2015 result. Hold it back for testing:
# rugbyData keeps the full table, rugbyData0 is the copy without that row.
rugbyData0 <- rugbyData[-1, ]

# Removing a row leaves the old row labels behind, so reset them
rownames(rugbyData0) <- NULL

# Viewing final table
head(rugbyData0)
##         Date                       Venue   Score  Winner
## 1 2014-03-09  Twickenham Stadium, London 29 – 18 England
## 2 2013-03-16 Millennium Stadium, Cardiff  30 – 3   Wales
## 3 2012-02-25  Twickenham Stadium, London 12 – 19   Wales
## 4 2011-08-13 Millennium Stadium, Cardiff  19 – 9   Wales
## 5 2011-08-06          Twickenham, London 23 – 19 England
## 6 2011-02-04 Millennium Stadium, Cardiff 19 – 26 England
# Ok. we still have scores in "xx - yy" format
# So pull out every run of digits and put "xx" and "yy" in different columns

matches <- regmatches(rugbyData0$Score, gregexpr("[[:digit:]]+", rugbyData0$Score))

# One data-frame column per match, one row per number found in its score.
# Columns are labelled by match position; the count is taken from the data
# instead of a hard-coded 125 so the script keeps working when the table grows.
matches <- as.data.frame(matches)
colnames(matches) <- seq_len(ncol(matches))

# Row 1 holds the first number of every score, row 2 the second
matches1 <- matches[1, ]
matches2 <- matches[2, ]

tail(matches1)
##    1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
## 1 29 30 12 19 23 19 30 23 19 62 27 47 11 31 28  9  9 50 15 46 32 60 13 21
##   25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48
## 1  9 15 10 24  6 34 12  3  3 19 21 24 15 13 17 21  9 27  6 14  9 20 16 25
##   49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72
## 1  3 22 13 30 11 34  6 14  6  6  0  6 14  5  3  0  3  3  9  3  6 23  5  9
##   73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96
## 1  3  6  3 14  4  0  3  0  3 12 11  3  8  8 11  3 12  9  7 28 18 19 10  0
##   97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115
## 1  8 15 11   8  18  22   3  25  14  21   8  13   3  26  14  11  25   6  24
##   116 117 118 119 120 121 122 123 124 125
## 1  12  17   3   0   0   0   1   1   0   8
# Second number of each score, one column per match
tail(matches2)
##    1 2  3 4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
## 2 18 3 19 9 19 26 17 15 26  5 18 13  9 21 17 43 26 10 44 12 31 26 34 15 23
##   26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49
## 2  8  9  0 25  6  9 11 16 12 18 15 24 13  7 19  8  3  9  9 21  4 12  9 12
##   50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73
## 2  6 17  9 11 21 11  3  6 13  0  3  6  0  3  3  8  0  6  8  8  5 11  3  3
##   74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97
## 2  9  0  8  3  0  3  9  7  5 11 11  3 10  9  3  6 17  3  6  3  5  9 12  0
##   98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115
## 2 11  6   0  28   0  16   0  14   5   9   0  13   3   7   0   0  14   3
##   116 117 118 119 120 121 122 123 124 125
## 2  11   0   7   1   0   5   4   5   0  30
# replacing last 5 elements in the tail with actual score
# The columns are addressed relative to the end (not as 121:125) so the
# patch still hits the same five matches if the number of columns changes.
last_five <- tail(seq_len(ncol(matches1)), 5)
matches1[, last_five] <- c(5, 4, 5, 0, 30)
matches2[, last_five] <- c(3, 7, 3, 10, 0)
t(matches1)

# Transpose so that each match becomes a row, then attach as new columns
rugbyData0$Score1 <- t(matches1)
rugbyData0$Score2 <- t(matches2)

# remove "Score" column (by name, so it does not depend on column position)
rugbyData0$Score <- NULL

# Inspect the storage mode and the class of every column
sapply(rugbyData0, mode)
sapply(rugbyData0, class)
# OOOO La La... a lot of mess... so coerce both score columns to numeric

rugbyData0$Score1 <- as.numeric(rugbyData0$Score1)
rugbyData0$Score2 <- as.numeric(rugbyData0$Score2)
### Separating Winner from Looser

# Work on a copy so rugbyData0 stays as it was
rugbyData1 <- rugbyData0

# Vectorised replacement for the former row-by-row loop (which iterated over
# 1:nrow() and renamed the new columns by position):
#   WinnerScore - the higher of the two scores of a match
#   LooserScore - the lower of the two scores of a match
#   DrawScore   - the shared score when both scores are equal, otherwise 0
# When the scores are equal, WinnerScore and LooserScore both hold that score.
rugbyData1$WinnerScore <- pmax(rugbyData1$Score1, rugbyData1$Score2)
rugbyData1$LooserScore <- pmin(rugbyData1$Score1, rugbyData1$Score2)
rugbyData1$DrawScore <- ifelse(rugbyData1$Score1 == rugbyData1$Score2,
                               rugbyData1$Score1, 0)

# The raw first/second scores are no longer needed
rugbyData1$Score1 <- NULL
rugbyData1$Score2 <- NULL

# So our data frame requires to be converted into workable dataset:
# re-express the winner/loser scores as one score per team.
# The team named in Winner takes WinnerScore and the other team takes
# LooserScore. For a "draw" neither name matches, so both teams take
# LooserScore, which is expected to equal WinnerScore in that case.
# This replaces four row-by-row loops: one was an exact repeat of the first,
# and the last one only re-assigned the draw rows and added DrawScore to the
# rest (assumes DrawScore is 0 whenever Winner is not "draw" - TODO confirm).
rugbyData1$EnglandScore <- ifelse(rugbyData1$Winner == "England",
                                  rugbyData1$WinnerScore,
                                  rugbyData1$LooserScore)
rugbyData1$WalesScore <- ifelse(rugbyData1$Winner == "Wales",
                                rugbyData1$WinnerScore,
                                rugbyData1$LooserScore)


# Separating Venue as whether "Home" or "Away" or "Other"
# Lookup table: "GameVenue" is the place name, "Venue" the group it belongs to
England <- data.frame(Venue = "England",
                      GameVenue = c("London", "Leeds", "Birkenhead",
                                    "Gloucester", "Leicester", "Richmond",
                                    "Yorkshire", "Bristol"))

Wales <- data.frame(Venue = "Wales",
                    GameVenue = c("Cardiff", "Swansea", "Newport", "Llanelli"))

Other <- data.frame(Venue = "Other", GameVenue = "Australia")

# Stack the three pieces into one table. They share no rows, so the nested
# full outer merge() used before was only acting as a roundabout rbind().
Venue <- rbind(England, Wales, Other)

# Extracting last word from each row in a column:
# split every Venue text on spaces and keep the final piece.
# vapply() handles the whole column at once (also when there are no rows)
# and guarantees exactly one string per row.
rugbyData1$GameVenue <- vapply(
  strsplit(rugbyData1$Venue, split = " "),
  function(words) tail(words, 1),
  character(1)
)

# So converting all into one Final Data Set
# Attach the lookup's "Venue" to every match through the GameVenue key.
# rugbyData1's own "Venue" text column is left out of the join: otherwise the
# result carries two columns named "Venue", and removing "the" Venue column
# only works because `$<-` happens to hit the first of them.
rugbyDataFinal <- join(rugbyData1[, names(rugbyData1) != "Venue"], Venue,
                       by = "GameVenue")

# Drop the helper columns that are not part of the final data set
rugbyDataFinal$WinnerScore <- NULL
rugbyDataFinal$LooserScore <- NULL
rugbyDataFinal$DrawScore <- NULL
rugbyDataFinal$GameVenue <- NULL
head(rugbyDataFinal)
##         Date  Winner EnglandScore WalesScore   Venue
## 1 2014-03-09 England           29         18 England
## 2 2013-03-16   Wales            3         30   Wales
## 3 2012-02-25   Wales           12         19 England
## 4 2011-08-13   Wales            9         19   Wales
## 5 2011-08-06 England           23         19 England
## 6 2011-02-04 England           26         19   Wales
# Wow... this dataset can be used for many statistical purposes
# We do as in one of the tutorials on the internet
# Last Part of Data Cleaning and Converting into Workable
# We need whether winner won in "Home" venue or "Away"

# "Home" when the Winner value equals the Venue value of the match, otherwise
# "Away". Done for the whole column at once; a missing Venue now yields NA
# instead of stopping the script inside if().
# NOTE(review): "draw" never equals a venue, so drawn matches are labelled
# "Away" as well, and so is every match at an "Other" venue - confirm intended.
rugbyDataFinal$WinnerVenue <- ifelse(
  rugbyDataFinal$Winner == as.character(rugbyDataFinal$Venue),
  "Home",
  "Away"
)

# Copy both venue columns back onto the wider rugbyData1 table
rugbyData1$GamesVenue <- rugbyDataFinal$Venue
rugbyData1$WinnersVenue <- rugbyDataFinal$WinnerVenue

# Write both data frames to disk in three formats
# rds
saveRDS(rugbyData1, "rugbyData.rds")
saveRDS(rugbyDataFinal, "rugbyDataFinal.rds")
# RData
save(rugbyData1, file = "rugbyData.RData")
save(rugbyDataFinal, file = "rugbyDataFinal.RData")
# csv
write.csv(rugbyData1, "rugbyData.csv")
write.csv(rugbyDataFinal, "rugbyDataFinal.csv")

# All files are now saved in the default directory.
# Empty the R environment to free memory, then reload from the saved files.

rm(list = ls())
# Reload from the working directory; "rds" is the author's preferred format
RugbyDataFinal <- readRDS("rugbyDataFinal.rds")
RugbyData <- readRDS("rugbyData.rds")

3. Plotting the Data

# Let us plot the data first
# Blue dots are the WinnerScore of each match and red dots the LooserScore
# (whichever team that was); the blue line is a loess smoother through the
# WinnerScore points.

p <- ggplot(RugbyData, aes(x = Date, y = WinnerScore))
p + geom_point(colour = "blue", size = 3, shape = 20) +
  # same data and x as the base plot, only y is swapped
  geom_point(aes(y = LooserScore), colour = "red", size = 3, shape = 20) +
  theme(axis.text.x = element_text(angle = 90, size = 11, vjust = 0.5, face = "bold", color = "black"),
        axis.text.y = element_text(size = 11, vjust = 0.5, face = "bold", color = "black"),
        axis.title.x = element_text(size = 15, color = "forestgreen", vjust = 0.35, face = "bold"),
        axis.title.y = element_text(size = 13, color = "blue", vjust = 0.35, face = "bold")) +
  stat_smooth(method = "loess", se = FALSE, colour = "blue", size = 1) +
  # labs() takes the labels as named arguments; wrapped in list() they are
  # not picked up by current ggplot2 versions
  labs(x = "Year", y = "Winner's Score",
       title = "Winner's(blue, with smoothing line) \nAnd Looser's (red) Scores - Yearly")
unnamed-chunk-9-1
# But.... YYYuuuukkkkk!!!
# This plot looks like it is missing many things and needs some make-up

4. Make-up

# Guys, I am deliberately not showing my R code here
# You have to "LIKE" my post and "Facebook page", and "Reply to the post"...
# Then I will email you the code for that graph

## ggplot - plotting on dual axis (both the Y axis)

unnamed-chunk-11-1
# So in the above plot we cannot see the significant differences between two teams.

5. Predict: who will win this year

# If we look at the historical data on who won the previous encounters,
# we see that the two sides are almost level: any edge is slight and
# nothing statistically significant.
# Here is the result

# Count the matches per outcome recorded in the Winner column
Wales_Win <- sum(RugbyData$Winner == "Wales")
England_Win <- sum(RugbyData$Winner == "England")
Draw <- sum(RugbyData$Winner == "draw")

# One-row summary table with readable column headings, rendered by pander
cnames <- c("Wales Wins", "England Wins", "Draw")
test <- setNames(data.frame(Wales_Win, England_Win, Draw), cnames)
pander(test)

Wales            Wins England         Wins Draw


 56            57              12  

# Clearly there is no reason to say that either England or Wales
# has won more games; over the years they are almost the same.
# One might say that England has won 1 game more than Wales,
# but 1 in 113 (excluding the 12 draws) doesn't make much difference.

# Let us perform a T-test to have a basic statistical idea
# Compares the mean of WinnerScore with the mean of LooserScore; called with
# two vectors and default arguments, t.test() runs a Welch two-sample test.
# NOTE(review): the two vectors are paired row by row (same match) and
# WinnerScore is presumably never below LooserScore - confirm that an
# unpaired test is what is intended here.
ttest <- t.test(RugbyData$WinnerScore, RugbyData$LooserScore)

# Results of T-Test
ttest
## 
##  Welch Two Sample t-test
## 
## data:  RugbyData$WinnerScore and RugbyData$LooserScore
## t = 8.3961, df = 188.195, p-value = 1.094e-14
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##   7.687248 12.408752
## sample estimates:
## mean of x mean of y 
##    17.352     7.304
# Clearly two means are significantly different from each other and from zero.

# Density plot of p-values in T-test
# replicate(100, ttest$p.value) only repeated one fixed number 100 times, so
# the density had nothing to show. Instead, re-run the same test on 100
# bootstrap resamples of the matches, which gives a p-value per resample.
# NOTE(review): the bootstrap is an assumption about what was intended here.
set.seed(2015)  # make the resampling reproducible
boot_p <- replicate(100, {
  rows <- sample.int(nrow(RugbyData), replace = TRUE)
  t.test(RugbyData$WinnerScore[rows], RugbyData$LooserScore[rows])$p.value
})
plot(density(boot_p),
     main = "Plot of p-values", col = "red", lwd = 2)
unnamed-chunk-14-1

# Preparing a small dataset for logistic regression
# Columns are selected by name instead of by position (1, 3, 4, 5, 11), so
# the subset does not silently change if the column order changes upstream.
data1 <- RugbyData[, c("Date", "Winner", "WinnerScore", "LooserScore",
                       "WinnersVenue")]
summary(data1)
##       Date               Winner           WinnerScore     LooserScore    
##  Min.   :1881-02-19   Length:125         Min.   : 0.00   Min.   : 0.000  
##  1st Qu.:1920-01-17   Class :character   1st Qu.: 9.00   1st Qu.: 3.000  
##  Median :1958-01-18   Mode  :character   Median :14.00   Median : 6.000  
##  Mean   :1953-03-31                      Mean   :17.35   Mean   : 7.304  
##  3rd Qu.:1988-02-06                      3rd Qu.:24.00   3rd Qu.:11.000  
##  Max.   :2014-03-09                      Max.   :62.00   Max.   :31.000  
##  WinnersVenue      
##  Length:125        
##  Class :character  
##  Mode  :character  
##                    
##                    
## 
# Show each column's type before the conversion to factors
str(data1)
## 'data.frame':    125 obs. of  5 variables:
##  $ Date        : Date, format: "2014-03-09" "2013-03-16" ...
##  $ Winner      : chr  "England" "Wales" "Wales" "Wales" ...
##  $ WinnerScore : num  29 30 19 19 23 26 30 23 26 62 ...
##  $ LooserScore : num  18 3 12 9 19 19 17 15 19 5 ...
##  $ WinnersVenue: chr  "Home" "Home" "Away" "Home" ...
# Turn both columns into factors in one step, then re-check the structure
data1[c("Winner", "WinnersVenue")] <-
  lapply(data1[c("Winner", "WinnersVenue")], as.factor)
str(data1)
## 'data.frame':    125 obs. of  5 variables:
##  $ Date        : Date, format: "2014-03-09" "2013-03-16" ...
##  $ Winner      : Factor w/ 3 levels "draw","England",..: 2 3 3 3 2 2 2 3 3 2 ...
##  $ WinnerScore : num  29 30 19 19 23 26 30 23 26 62 ...
##  $ LooserScore : num  18 3 12 9 19 19 17 15 19 5 ...
##  $ WinnersVenue: Factor w/ 2 levels "Away","Home": 2 2 1 2 2 1 2 2 1 2 ...
# EnglandWins: 1 for matches whose Winner is "England", 0 for everything else.
# Computed for the whole column at once instead of row by row.
data1$EnglandWins <- as.numeric(data1$Winner == "England")
# Done....

saveRDS(data1, file = "rugbyData1.rds", refhook = NULL)

# View Dataset
summary(data1)
##       Date                Winner    WinnerScore     LooserScore    
##  Min.   :1881-02-19   draw   :12   Min.   : 0.00   Min.   : 0.000  
##  1st Qu.:1920-01-17   England:57   1st Qu.: 9.00   1st Qu.: 3.000  
##  Median :1958-01-18   Wales  :56   Median :14.00   Median : 6.000  
##  Mean   :1953-03-31                Mean   :17.35   Mean   : 7.304  
##  3rd Qu.:1988-02-06                3rd Qu.:24.00   3rd Qu.:11.000  
##  Max.   :2014-03-09                Max.   :62.00   Max.   :31.000  
##  WinnersVenue  EnglandWins   
##  Away:53      Min.   :0.000  
##  Home:72      1st Qu.:0.000  
##               Median :0.000  
##               Mean   :0.456  
##               3rd Qu.:1.000  
##               Max.   :1.000
# Storing "Date", "Winner", "EnglandScore", "WalesScore", "GamesVenue", "WinnersVenue" 
# in a separate data frame
# (selected by name instead of by position 1, 3, 7, 8, 10, 11, so the subset
# does not depend on the column order)

data2 <- RugbyData[, c("Date", "Winner", "EnglandScore", "WalesScore",
                       "GamesVenue", "WinnersVenue")]
summary(data2)
##       Date               Winner           EnglandScore     WalesScore  
##  Min.   :1881-02-19   Length:125         Min.   : 0.00   Min.   : 0.0  
##  1st Qu.:1920-01-17   Class :character   1st Qu.: 4.00   1st Qu.: 5.0  
##  Median :1958-01-18   Mode  :character   Median : 9.00   Median :10.0  
##  Mean   :1953-03-31                      Mean   :13.06   Mean   :11.6  
##  3rd Qu.:1988-02-06                      3rd Qu.:17.00   3rd Qu.:18.0  
##  Max.   :2014-03-09                      Max.   :62.00   Max.   :34.0  
##    GamesVenue WinnersVenue      
##  England:63   Length:125        
##  Wales  :60   Class :character  
##  Other  : 2   Mode  :character  
##                                 
##                                 
## 
# Show each column's type before the conversion to factors
str(data2)
## 'data.frame':    125 obs. of  6 variables:
##  $ Date        : Date, format: "2014-03-09" "2013-03-16" ...
##  $ Winner      : chr  "England" "Wales" "Wales" "Wales" ...
##  $ EnglandScore: num  29 3 12 9 23 26 30 15 19 62 ...
##  $ WalesScore  : num  18 30 19 19 19 19 17 23 26 5 ...
##  $ GamesVenue  : Factor w/ 3 levels "England","Wales",..: 1 2 1 2 1 2 1 2 1 1 ...
##  $ WinnersVenue: chr  "Home" "Home" "Away" "Home" ...
# Turn both columns into factors in one step, then re-check the structure
data2[c("Winner", "WinnersVenue")] <-
  lapply(data2[c("Winner", "WinnersVenue")], as.factor)
str(data2)
## 'data.frame':    125 obs. of  6 variables:
##  $ Date        : Date, format: "2014-03-09" "2013-03-16" ...
##  $ Winner      : Factor w/ 3 levels "draw","England",..: 2 3 3 3 2 2 2 3 3 2 ...
##  $ EnglandScore: num  29 3 12 9 23 26 30 15 19 62 ...
##  $ WalesScore  : num  18 30 19 19 19 19 17 23 26 5 ...
##  $ GamesVenue  : Factor w/ 3 levels "England","Wales",..: 1 2 1 2 1 2 1 2 1 1 ...
##  $ WinnersVenue: Factor w/ 2 levels "Away","Home": 2 2 1 2 2 1 2 2 1 2 ...
# HomeVenue: 1 when WinnersVenue is "Home", otherwise 0.
# Computed for the whole column at once instead of row by row.
data2$HomeVenue <- as.numeric(data2$WinnersVenue == "Home")

# Plot the winner's venue (Home/Away) against the winning team
plot(data2$WinnersVenue~data2$Winner, col=c("red", "green"), 
     xlab="Winner Team", ylab="Venue of Game", main="Winning w.r.t. Home or Away Venue")
unnamed-chunk-16-1


# (An exact second copy of the HomeVenue loop used to sit here. HomeVenue is
# already filled in before the plot and nothing it reads changes in between,
# so the repeat has been removed.)

# Home: which team won at home.
#   "HomeEngland" - WinnersVenue is "Home" and Winner is "England"
#   "HomeWales"   - WinnersVenue is "Home" and Winner is "Wales"
#   "HomeOther"   - every other match
data2$Home <- ifelse(
  data2$WinnersVenue == "Home" & data2$Winner == "England", "HomeEngland",
  ifelse(data2$WinnersVenue == "Home" & data2$Winner == "Wales",
         "HomeWales", "HomeOther")
)
data2[, 'Home'] <- as.factor(data2[, 'Home'])

# Saving into rds
saveRDS(data2, file = "rugbyData2.rds", refhook = NULL)
rm(list=ls())

# Reload the first prepared data set from disk and preview it
Data1 <- readRDS("rugbyData1.rds")

head(Data1)
##         Date  Winner WinnerScore LooserScore WinnersVenue EnglandWins
## 1 2014-03-09 England          29          18         Home           1
## 2 2013-03-16   Wales          30           3         Home           0
## 3 2012-02-25   Wales          19          12         Away           0
## 4 2011-08-13   Wales          19           9         Home           0
## 5 2011-08-06 England          23          19         Home           1
## 6 2011-02-04 England          26          19         Away           1
# Reload the second prepared data set from disk and preview it
Data2 <- readRDS("rugbyData2.rds")
head(Data2)
##         Date  Winner EnglandScore WalesScore GamesVenue WinnersVenue
## 1 2014-03-09 England           29         18    England         Home
## 2 2013-03-16   Wales            3         30      Wales         Home
## 3 2012-02-25   Wales           12         19    England         Away
## 4 2011-08-13   Wales            9         19      Wales         Home
## 5 2011-08-06 England           23         19    England         Home
## 6 2011-02-04 England           26         19      Wales         Away
##   HomeVenue        Home
## 1         1 HomeEngland
## 2         1   HomeWales
## 3         0   HomeOther
## 4         1   HomeWales
## 5         1 HomeEngland
## 6         0   HomeOther
# Let us work on prediction now 

COMING SOON…..

Part 2

Advertisements