**Tags**

Data Clean, Forecasting, Missing Values, NA, NaN, R, Rearranging Columns, replacing, Splitting Data in a Column, Tutorial

# Load the raw export; the first row of the CSV holds the column names.
weather <- read.csv("M4_not_cleaned1.csv", header = TRUE)

# Keep only the timestamp and the atmospheric pressure reading. Selecting by
# name (instead of by position 2:3) keeps the script working if the column
# order of the CSV ever changes.
new.weather <- weather[, c("time_UTC", "AtmosphericPressure_mb")]

# Check whether the pressure series has missing data. Counting the missing
# entries is far more readable than printing one TRUE/FALSE per row;
# is.na() is TRUE for both NA and NaN.
sum(is.na(new.weather$AtmosphericPressure_mb))

# Yes, there are many... In that case we can either 1) omit these NA's, or
# 2) take the mean of the previous and the following value and replace the NA
# with that mean.

# Split the ISO-8601 style timestamp ("<date>T<time>Z") into separate date and
# time columns.
#
# fixed = TRUE makes strsplit() treat "T" and "Z" as literal characters rather
# than as regular expressions, and vapply() guarantees plain character columns
# (lapply() would store list-columns in the data frame).
timestamp_parts <- strsplit(as.character(new.weather$time_UTC), "T",
                            fixed = TRUE)

# Piece 1 of the split is the date, piece 2 is the time still carrying the
# trailing "Z". A missing piece yields NA_character_.
new.weather$date <- vapply(timestamp_parts, `[`, FUN.VALUE = character(1), 1L,
                           USE.NAMES = FALSE)
time_with_zone <- vapply(timestamp_parts, `[`, FUN.VALUE = character(1), 2L,
                         USE.NAMES = FALSE)

# Remove the "Z" from the time: keep whatever precedes the first "Z".
new.weather$time <- vapply(strsplit(time_with_zone, "Z", fixed = TRUE), `[`,
                           FUN.VALUE = character(1), 1L, USE.NAMES = FALSE)

# Drop the raw timestamp and rearrange the columns by name
# (date, time and atmospheric pressure) in a single step. Selecting by name
# avoids the fragile negative positional indices and no throw-away helper
# columns are ever added to the data frame.
new.weather <- new.weather[, c("date", "time", "AtmosphericPressure_mb")]

## So far we have only cleaned and rearranged the dataset; now let us deal with the NaN/NA values

##################################################################################################

#If you want to see specific location (row number), use following functions:

#new.weather1 <- subset(new.weather, is.na(AtmosphericPressure_mb))

#where.nan <- which(new.weather$AtmosphericPressure_mb == "NaN")

#head(where.nan)

##################################################################################################

# First create another data frame, so that the cleaned and rearranged
# data frame (new.weather) stays untouched.
new.weather1 <- new.weather

# Store the mean of the series computed over the non-missing observations only
# (na.rm = TRUE drops both NA and NaN).
mean1 <- mean(new.weather$AtmosphericPressure_mb, na.rm = TRUE)

mean1 # look at the mean of the time series without NA or NaN (missing) values
# results in:
# [1] 1010.266

# Then replace the missing values in the series by that mean... but why the
# mean? Simply because replacing with the mean does not alter the overall mean
# of the series, whereas replacing missing values with ZERO definitely does;
# you can check on your own...
#
# is.na() is TRUE for both NA and NaN, so every kind of missing value is
# replaced; is.nan() alone would leave plain NA's behind and the mean below
# would then come out as NA.
missing1 <- is.na(new.weather1$AtmosphericPressure_mb)
new.weather1$AtmosphericPressure_mb[missing1] <- mean1

# Want to check whether the mean is still the same after replacing the missing
# values? Here is the check:
mean2 <- mean(new.weather1$AtmosphericPressure_mb)

mean2
# which still gives the same mean as earlier:
# [1] 1010.266

##########################################################################################

# When the missing values are replaced with ZERO, what will the mean be?
new.weather2 <- new.weather

# is.na() covers both NA and NaN; with is.nan() alone any plain NA would
# survive the replacement and mean3 would come out as NA.
missing2 <- is.na(new.weather2$AtmosphericPressure_mb)
new.weather2$AtmosphericPressure_mb[missing2] <- 0

# Want to check whether the mean is still the same after replacing the missing
# values? Here is the check:
mean3 <- mean(new.weather2$AtmosphericPressure_mb)

mean3
# which now gives a different mean than earlier:
# [1] 917.5748

# That's why I replaced the missing values with the mean of the time series
# computed without the NaN/NA's.
# Got it??

##########################################################################################

# Check whether any missing values are left in the replaced time series.
# is.na() flags both NA and NaN, so this cannot report "nothing missing" while
# plain NA's are still present (is.nan() alone would overlook them).
isnan <- which(is.na(new.weather1$AtmosphericPressure_mb))

isnan
# this results in an empty integer vector:
# integer(0)
# So it is confirmed that our new time series data new.weather1 does not have
# any missing values.

# Similarly, we can check whether there are any missing values in the date and
# time columns of the time series:
isnan1 <- which(is.na(new.weather1$date))

isnan1

isnan2 <- which(is.na(new.weather1$time))

isnan2
# in both cases, you should receive integer(0) as the result

################ NEXT STEP (will be explained in PART 3 of this Tutorial Series) #################