Tags

, , , , , , ,

History of Rugby Union Matches Between England and Wales

1. Loading Essential Libraries

# Load the library
library(magrittr)
library(rvest)
library(ggplot2)
library(gtable)
library(grid)
library(taRifx)
library(xtable)
library(pander)
library(stringr)
library(plyr)

2. Setting Up the Data Frame

# Address of the Wikipedia page holding the results table.
# It is assembled with paste0() so the string contains no line break
# (the address was previously split across two source lines).
URL <- paste0(
  "http://en.wikipedia.org/wiki/",
  "History_of_rugby_union_matches_between_England_and_Wales"
)

# Download and parse the page; read_html() replaces the deprecated html()
rugbyHTML <- read_html(URL)

# Take the third table with class "wikitable" on the page and convert
# it into a data frame
rugbyData <- rugbyHTML %>%
  html_nodes("table.wikitable") %>%
  .[[3]] %>%
  html_table()

# Viewing data
head(rugbyData)

## No. Date Venue Score Winner
## 1 126 6 February 2015 Millennium Stadium, Cardiff 16 – 21 England
## 2 125 9 March 2014 Twickenham Stadium, London 29 – 18 England
## 3 124 16 March 2013 Millennium Stadium, Cardiff 30 – 3 Wales
## 4 123 25 February 2012 Twickenham Stadium, London 12 – 19 Wales
## 5 122 13 August 2011 Millennium Stadium, Cardiff 19 – 9 Wales
## 6 121 6 August 2011 Twickenham, London 23 – 19 England
## Competition Match report
## 1 2015 Six Nations
## 2 2014 Six Nations
## 3 2013 Six Nations BBC
## 4 2012 Six Nations BBC
## 5 2011 Rugby World Cup warm up test BBC
## 6 2011 Rugby World Cup warm up test BBC

# Parse the text dates (day, month name, year) into Date objects
rugbyData[["Date"]] <- as.Date(rugbyData[["Date"]], format = "%d %b %Y")

# Drop the columns this tutorial does not use (positions 1, 6 and 7)
unused_cols <- c(1, 6, 7)
rugbyData <- rugbyData[, -unused_cols]

# Work on a copy without the first row (the just-announced 2015 result),
# which is held back for testing; rugbyData itself keeps that row
rugbyData0 <- rugbyData[-1, ]

# Dropping a row leaves gaps in the row names, so reset them to 1..n
rownames(rugbyData0) <- NULL

# Viewing final table
head(rugbyData0)

## Date Venue Score Winner
## 1 2014-03-09 Twickenham Stadium, London 29 – 18 England
## 2 2013-03-16 Millennium Stadium, Cardiff 30 – 3 Wales
## 3 2012-02-25 Twickenham Stadium, London 12 – 19 Wales
## 4 2011-08-13 Millennium Stadium, Cardiff 19 – 9 Wales
## 5 2011-08-06 Twickenham, London 23 – 19 England
## 6 2011-02-04 Millennium Stadium, Cardiff 19 – 26 England

# The Score column is still text in "xx – yy" form.
# Pull every run of digits out of each entry: the result is a list with
# one character vector of numbers per match.
matches <- regmatches(
  rugbyData0$Score,
  gregexpr("[[:digit:]]+", rugbyData0$Score)
)

# One column per match, one row per extracted number. Keep the values as
# character (not factor) so they convert cleanly to numbers later.
matches <- as.data.frame(matches, stringsAsFactors = FALSE)

# Number the columns by match instead of hard-coding the match count
colnames(matches) <- seq_along(matches)

# Row 1 = first number of every score, row 2 = second number.
# drop = FALSE keeps each result a one-row data frame.
matches1 <- matches[1, , drop = FALSE]
matches2 <- matches[2, , drop = FALSE]

tail(matches1)

## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
## 1 29 30 12 19 23 19 30 23 19 62 27 47 11 31 28 9 9 50 15 46 32 60 13 21
## 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48
## 1 9 15 10 24 6 34 12 3 3 19 21 24 15 13 17 21 9 27 6 14 9 20 16 25
## 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72
## 1 3 22 13 30 11 34 6 14 6 6 0 6 14 5 3 0 3 3 9 3 6 23 5 9
## 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96
## 1 3 6 3 14 4 0 3 0 3 12 11 3 8 8 11 3 12 9 7 28 18 19 10 0
## 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115
## 1 8 15 11 8 18 22 3 25 14 21 8 13 3 26 14 11 25 6 24
## 116 117 118 119 120 121 122 123 124 125
## 1 12 17 3 0 0 0 1 1 0 8

# Same check for matches2, the second one-row data frame taken from `matches`
tail(matches2)

## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
## 2 18 3 19 9 19 26 17 15 26 5 18 13 9 21 17 43 26 10 44 12 31 26 34 15 23
## 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49
## 2 8 9 0 25 6 9 11 16 12 18 15 24 13 7 19 8 3 9 9 21 4 12 9 12
## 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73
## 2 6 17 9 11 21 11 3 6 13 0 3 6 0 3 3 8 0 6 8 8 5 11 3 3
## 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97
## 2 9 0 8 3 0 3 9 7 5 11 11 3 10 9 3 6 17 3 6 3 5 9 12 0
## 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115
## 2 11 6 0 28 0 16 0 14 5 9 0 13 3 7 0 0 14 3
## 116 117 118 119 120 121 122 123 124 125
## 2 11 0 7 1 0 5 4 5 0 30

# The last five columns do not hold the real point totals after the digit
# extraction, so they are overwritten by hand with the actual scores.
# The positions are computed from the column count instead of hard-coding
# 121:125.
stopifnot(ncol(matches1) >= 5, ncol(matches2) >= 5)
last_five <- seq(to = ncol(matches1), length.out = 5)
matches1[, last_five] <- c(5, 4, 5, 0, 30)
matches2[, last_five] <- c(3, 7, 3, 10, 0)

# Display the first scores as a single column
t(matches1)

# Flatten a one-row data frame into a plain numeric vector.
# as.character() first, so character, factor and numeric columns all
# convert by value.
score_row_to_numeric <- function(one_row) {
  unname(vapply(
    one_row,
    function(cell) as.numeric(as.character(cell)),
    numeric(1)
  ))
}

# Store the scores directly as numeric columns (previously they were added
# as transposed matrices and converted afterwards)
rugbyData0$Score1 <- score_row_to_numeric(matches1)
rugbyData0$Score2 <- score_row_to_numeric(matches2)

# remove "Score" column (by name rather than by position)
rugbyData0$Score <- NULL

# Checking Mode and Class of the Data Frame
sapply(rugbyData0, mode)
sapply(rugbyData0, class)

### Separating Winner from Looser

rugbyData1 <- rugbyData0

# Work on whole columns instead of looping over rows.
first_score <- rugbyData1$Score1
second_score <- rugbyData1$Score2

# WinnerScore is the larger of the two scores and LooserScore the smaller.
# For a drawn match both equal the shared score.
rugbyData1$WinnerScore <- pmax(first_score, second_score)
rugbyData1$LooserScore <- pmin(first_score, second_score)

# DrawScore is the shared score for drawn matches and 0 otherwise
rugbyData1$DrawScore <- ifelse(first_score == second_score, first_score, 0)

# The raw scores are no longer needed. Resulting column order:
# Date, Venue, Winner, WinnerScore, LooserScore, DrawScore
rugbyData1$Score1 <- NULL
rugbyData1$Score2 <- NULL

# Re-express the winner/loser scores as one score column per team.
# Vectorised replacement for the row loops (one of which ran twice).
england_won <- rugbyData1$Winner == "England"
wales_won <- rugbyData1$Winner == "Wales"
drawn <- rugbyData1$Winner == "draw"

# A team gets WinnerScore when it won or the match was drawn, and
# LooserScore when it lost. The old extra "+ DrawScore" step is omitted
# because it only ever applied to rows not labelled "draw", where the
# value it added is 0.
rugbyData1$EnglandScore <- with(
  rugbyData1,
  ifelse(england_won | drawn, WinnerScore, LooserScore)
)
rugbyData1$WalesScore <- with(
  rugbyData1,
  ifelse(wales_won | drawn, WinnerScore, LooserScore)
)

# Lookup table classifying the last word of a venue as "England", "Wales"
# or "Other"
venue_towns <- list(
  England = c("London", "Leeds", "Birkenhead", "Gloucester", "Leicester",
              "Richmond", "Yorkshire", "Bristol"),
  Wales = c("Cardiff", "Swansea", "Newport", "Llanelli"),
  Other = c("Australia")
)

# Built in one step instead of merging three single-purpose data frames.
# The factor levels are fixed explicitly so their order (England, Wales,
# Other) does not depend on the R version's stringsAsFactors default.
Venue <- data.frame(
  Venue = factor(
    rep(names(venue_towns), lengths(venue_towns)),
    levels = names(venue_towns)
  ),
  GameVenue = unlist(venue_towns, use.names = FALSE),
  stringsAsFactors = FALSE
)

# Extracting the last space-separated word from each venue,
# e.g. "Twickenham Stadium, London" -> "London"
venue_words <- strsplit(as.character(rugbyData1$Venue), split = " ")
rugbyData1$GameVenue <- vapply(
  venue_words,
  function(words) tail(words, 1),
  character(1)
)

# Final data set: the match columns of interest plus the venue's country.
# Columns are chosen by name and the country is looked up with match().
# The earlier join-then-delete sequence produced two columns both called
# "Venue" and depended on `$Venue <- NULL` removing only the first.
rugbyDataFinal <- rugbyData1[
  ,
  c("Date", "Winner", "EnglandScore", "WalesScore")
]

# Row of the lookup table matching each game's town (NA when it is unknown)
lookup_row <- match(rugbyData1$GameVenue, Venue$GameVenue)
rugbyDataFinal$Venue <- Venue$Venue[lookup_row]

head(rugbyDataFinal)

## Date Winner EnglandScore WalesScore Venue
## 1 2014-03-09 England 29 18 England
## 2 2013-03-16 Wales 3 30 Wales
## 3 2012-02-25 Wales 12 19 England
## 4 2011-08-13 Wales 9 19 Wales
## 5 2011-08-06 England 23 19 England
## 6 2011-02-04 England 26 19 Wales

# Wow… this dataset can be used for many statistical purposes

# Last part of the data cleaning: record whether the winner won at its
# "Home" venue or "Away".
# A row is "Home" when the Winner label equals the venue's country and
# "Away" otherwise. Drawn matches (Winner is "draw") and matches at an
# "Other" venue can never match, so they are labelled "Away" as well.
# A missing venue gives NA instead of stopping the script.
winner_at_home <- as.character(rugbyDataFinal$Winner) ==
  as.character(rugbyDataFinal$Venue)
rugbyDataFinal$WinnerVenue <- ifelse(winner_at_home, "Home", "Away")

# Copy both venue columns back onto the detailed data frame
rugbyData1$GamesVenue <- rugbyDataFinal$Venue
rugbyData1$WinnersVenue <- rugbyDataFinal$WinnerVenue

# Write both data frames to the working directory in three formats.
# rds: one object per file
saveRDS(rugbyData1, "rugbyData.rds")
saveRDS(rugbyDataFinal, "rugbyDataFinal.rds")
# RData: stored together with the object's name
save(list = "rugbyData1", file = "rugbyData.RData")
save(list = "rugbyDataFinal", file = "rugbyDataFinal.RData")
# csv: plain text
write.csv(rugbyData1, "rugbyData.csv")
write.csv(rugbyDataFinal, "rugbyDataFinal.csv")

# Everything is now on disk, so empty the workspace to free memory
# and continue from the saved files
rm(list = ls())

# Reload from the working directory; the "rds" files are the author's
# preference
RugbyDataFinal <- readRDS("rugbyDataFinal.rds")
RugbyData <- readRDS("rugbyData.rds")

3. Plotting the Data

# Let us plot the data first.
# Blue points are the winning side's score in each match and red points
# the losing side's score (not England vs Wales).
# The blue curve is a loess smoothing line through the winners' scores.
p <- ggplot(RugbyData, aes(x = Date, y = WinnerScore))
p +
  geom_point(colour = "blue", size = 3, shape = 20) +
  # Same data and x axis, only the y variable changes
  geom_point(aes(y = LooserScore), colour = "red", size = 3, shape = 20) +
  # No fill: with se = FALSE there is no confidence band to colour
  stat_smooth(method = "loess", se = FALSE, colour = "blue", size = 1) +
  theme(
    axis.text.x = element_text(
      angle = 90, size = 11, vjust = 0.5, face = "bold", color = "black"
    ),
    axis.text.y = element_text(
      size = 11, vjust = 0.5, face = "bold", color = "black"
    ),
    axis.title.x = element_text(
      size = 15, color = "forestgreen", vjust = 0.35, face = "bold"
    ),
    axis.title.y = element_text(
      size = 13, color = "blue", vjust = 0.35, face = "bold"
    )
  ) +
  # Labels are passed as named arguments, the documented form of labs(),
  # rather than wrapped in a list
  labs(
    x = "Year",
    y = "Winner's Score",
    title = "Winner's(blue, with smoothing line) \nAnd Looser's (red) Scores – Yearly"
  )

# But…. YYYuuuukkkkk!!!
# This plot looks like missing many things and requires some make-up

4. Make-up

# Guys, I am deliberately not showing my R code here
# You have to "LIKE" my post, "Facebook page" and "Reply to the post"…
# Then I will email you the code for that graph

## ggplot – plotting on dual axis (both the Y axis)

# So in the above plot we cannot see the significant differences between two teams.

5. Predict: who will win this year

# If we look at the historical results of the previous encounters,
# neither side has a real edge: England lead Wales by a single win,
# which is nothing statistically significant.
# Here is the result

# Count the matches by outcome
winner_label <- RugbyData$Winner
Wales_Win <- sum(winner_label == "Wales")
England_Win <- sum(winner_label == "England")
Draw <- sum(winner_label == "draw")

# One-row summary table with readable column headings
cnames <- c("Wales Wins", "England Wins", "Draw")
test <- setNames(data.frame(Wales_Win, England_Win, Draw), cnames)

pander(test)

Wales Wins England Wins Draw
56 57 12

# Clearly there is no reason to say that either England or Wales
# has won more games; over the years they are almost level.
# One might point out that England has won 1 game more than Wales,
# but 1 in 113 (excluding the 12 draws) doesn't make much difference.

# Let us perform a T-test to have a basic statistical idea
# Compares the mean of the winners' scores with the mean of the losers'
# scores, using t.test() with its default settings.
# NOTE(review): both vectors come from the same matches (one pair per row),
# and WinnerScore is never below LooserScore by construction. Confirm that
# an unpaired test of these two columns is really what is intended.
ttest <- t.test(RugbyData$WinnerScore, RugbyData$LooserScore)

# Results of T-Test
ttest

##
## Welch Two Sample t-test
##
## data: RugbyData$WinnerScore and RugbyData$LooserScore
## t = 8.3961, df = 188.195, p-value = 1.094e-14
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 7.687248 12.408752
## sample estimates:
## mean of x mean of y
## 17.352 7.304

# Clearly two means are significantly different from each other and from zero.

# Density plot of p-values in T-test
# NOTE(review): replicate() re-evaluates `ttest$p.value` 100 times, and that
# expression reads the same stored value every time. The density is therefore
# estimated from 100 identical numbers. It looks like the test was meant to
# be re-run on resampled data each time; confirm.
plot(density(replicate(100, ttest$p.value)),
main = "Plot of p-values", col="red", lwd=2)

# Preparing a small dataset for logistic regression.
# Columns are selected by name rather than by position (1, 3, 4, 5, 11),
# so the selection still works if the column order of RugbyData changes.
data1 <- RugbyData[
  ,
  c("Date", "Winner", "WinnerScore", "LooserScore", "WinnersVenue")
]
summary(data1)

## Date Winner WinnerScore LooserScore
## Min. :1881-02-19 Length:125 Min. : 0.00 Min. : 0.000
## 1st Qu.:1920-01-17 Class :character 1st Qu.: 9.00 1st Qu.: 3.000
## Median :1958-01-18 Mode :character Median :14.00 Median : 6.000
## Mean :1953-03-31 Mean :17.35 Mean : 7.304
## 3rd Qu.:1988-02-06 3rd Qu.:24.00 3rd Qu.:11.000
## Max. :2014-03-09 Max. :62.00 Max. :31.000
## WinnersVenue
## Length:125
## Class :character
## Mode :character
##
##
##

# Inspect the column types of data1
str(data1)

## 'data.frame': 125 obs. of 5 variables:
## $ Date : Date, format: "2014-03-09" "2013-03-16" …
## $ Winner : chr "England" "Wales" "Wales" "Wales" …
## $ WinnerScore : num 29 30 19 19 23 26 30 23 26 62 …
## $ LooserScore : num 18 3 12 9 19 19 17 15 19 5 …
## $ WinnersVenue: chr "Home" "Home" "Away" "Home" …

# Convert the two label columns to factors, then re-check the structure
label_cols <- c("Winner", "WinnersVenue")
data1[label_cols] <- lapply(data1[label_cols], as.factor)
str(data1)

## 'data.frame': 125 obs. of 5 variables:
## $ Date : Date, format: "2014-03-09" "2013-03-16" …
## $ Winner : Factor w/ 3 levels "draw","England",..: 2 3 3 3 2 2 2 3 3 2 …
## $ WinnerScore : num 29 30 19 19 23 26 30 23 26 62 …
## $ LooserScore : num 18 3 12 9 19 19 17 15 19 5 …
## $ WinnersVenue: Factor w/ 2 levels "Away","Home": 2 2 1 2 2 1 2 2 1 2 …

# Binary outcome column: 1 when England won the match, 0 otherwise
# (Wales wins and draws both give 0). Computed on the whole column at once
# instead of row by row.
data1$EnglandWins <- as.numeric(data1$Winner == "England")

saveRDS(data1, file = "rugbyData1.rds")

# View Dataset
summary(data1)

## Date Winner WinnerScore LooserScore
## Min. :1881-02-19 draw :12 Min. : 0.00 Min. : 0.000
## 1st Qu.:1920-01-17 England:57 1st Qu.: 9.00 1st Qu.: 3.000
## Median :1958-01-18 Wales :56 Median :14.00 Median : 6.000
## Mean :1953-03-31 Mean :17.35 Mean : 7.304
## 3rd Qu.:1988-02-06 3rd Qu.:24.00 3rd Qu.:11.000
## Max. :2014-03-09 Max. :62.00 Max. :31.000
## WinnersVenue EnglandWins
## Away:53 Min. :0.000
## Home:72 1st Qu.:0.000
## Median :0.000
## Mean :0.456
## 3rd Qu.:1.000
## Max. :1.000

# Storing "Date", "Winner", "EnglandScore", "WalesScore", "GamesVenue",
# "WinnersVenue" in a separate data frame.
# Columns are selected by name rather than by position (1, 3, 7, 8, 10, 11),
# so the selection still works if the column order of RugbyData changes.
data2 <- RugbyData[
  ,
  c("Date", "Winner", "EnglandScore", "WalesScore", "GamesVenue",
    "WinnersVenue")
]
summary(data2)

## Date Winner EnglandScore WalesScore
## Min. :1881-02-19 Length:125 Min. : 0.00 Min. : 0.0
## 1st Qu.:1920-01-17 Class :character 1st Qu.: 4.00 1st Qu.: 5.0
## Median :1958-01-18 Mode :character Median : 9.00 Median :10.0
## Mean :1953-03-31 Mean :13.06 Mean :11.6
## 3rd Qu.:1988-02-06 3rd Qu.:17.00 3rd Qu.:18.0
## Max. :2014-03-09 Max. :62.00 Max. :34.0
## GamesVenue WinnersVenue
## England:63 Length:125
## Wales :60 Class :character
## Other : 2 Mode :character
##
##
##

# Inspect the column types of data2
str(data2)

## 'data.frame': 125 obs. of 6 variables:
## $ Date : Date, format: "2014-03-09" "2013-03-16" …
## $ Winner : chr "England" "Wales" "Wales" "Wales" …
## $ EnglandScore: num 29 3 12 9 23 26 30 15 19 62 …
## $ WalesScore : num 18 30 19 19 19 19 17 23 26 5 …
## $ GamesVenue : Factor w/ 3 levels "England","Wales",..: 1 2 1 2 1 2 1 2 1 1 …
## $ WinnersVenue: chr "Home" "Home" "Away" "Home" …

# Convert the two label columns to factors, then re-check the structure
label_cols <- c("Winner", "WinnersVenue")
data2[label_cols] <- lapply(data2[label_cols], as.factor)
str(data2)

## 'data.frame': 125 obs. of 6 variables:
## $ Date : Date, format: "2014-03-09" "2013-03-16" …
## $ Winner : Factor w/ 3 levels "draw","England",..: 2 3 3 3 2 2 2 3 3 2 …
## $ EnglandScore: num 29 3 12 9 23 26 30 15 19 62 …
## $ WalesScore : num 18 30 19 19 19 19 17 23 26 5 …
## $ GamesVenue : Factor w/ 3 levels "England","Wales",..: 1 2 1 2 1 2 1 2 1 1 …
## $ WinnersVenue: Factor w/ 2 levels "Away","Home": 2 2 1 2 2 1 2 2 1 2 …

# TRUE where the winner is recorded as having won at home
home_win <- data2$WinnersVenue == "Home"

# Numeric flag: 1 for a home win, 0 otherwise. Computed once on the whole
# column (the row loop that filled it used to run twice).
data2$HomeVenue <- as.numeric(home_win)

plot(data2$WinnersVenue ~ data2$Winner, col = c("red", "green"),
     xlab = "Winner Team", ylab = "Venue of Game",
     main = "Winning w.r.t. Home or Away Venue")

# Three-way label: a home win for England, a home win for Wales, or
# anything else (away wins, draws)
data2$Home <- ifelse(
  home_win & data2$Winner == "England",
  "HomeEngland",
  ifelse(home_win & data2$Winner == "Wales", "HomeWales", "HomeOther")
)
data2[, "Home"] <- as.factor(data2[, "Home"])

# Store data2 on disk, then clear the workspace before reloading
saveRDS(data2, "rugbyData2.rds")
rm(list = ls())

# Read back the regression dataset saved earlier and preview it
Data1 <- readRDS("rugbyData1.rds")
head(Data1)

## Date Winner WinnerScore LooserScore WinnersVenue EnglandWins
## 1 2014-03-09 England 29 18 Home 1
## 2 2013-03-16 Wales 30 3 Home 0
## 3 2012-02-25 Wales 19 12 Away 0
## 4 2011-08-13 Wales 19 9 Home 0
## 5 2011-08-06 England 23 19 Home 1
## 6 2011-02-04 England 26 19 Away 1

# Read back the venue dataset and preview it
Data2 <- readRDS("rugbyData2.rds")
head(Data2)

## Date Winner EnglandScore WalesScore GamesVenue WinnersVenue
## 1 2014-03-09 England 29 18 England Home
## 2 2013-03-16 Wales 3 30 Wales Home
## 3 2012-02-25 Wales 12 19 England Away
## 4 2011-08-13 Wales 9 19 Wales Home
## 5 2011-08-06 England 23 19 England Home
## 6 2011-02-04 England 26 19 Wales Away
## HomeVenue Home
## 1 1 HomeEngland
## 2 1 HomeWales
## 3 0 HomeOther
## 4 1 HomeWales
## 5 1 HomeEngland
## 6 0 HomeOther

# Let us work on prediction now

COMING SOON…..
Part 2

Advertisements