Tags

, , , , , , , , ,

# Load the raw weather export; the first line of the CSV holds the column names.
weather <- read.csv(file = "M4_not_cleaned1.csv", header = TRUE)

# Keep only the second and third columns (the UTC time stamp and the
# atmospheric pressure) in a separate data frame.
new.weather <- weather[, c(2, 3)]

# Flag every pressure reading that is missing (TRUE marks a missing value).
is.na(new.weather[["AtmosphericPressure_mb"]])

# Yes, there are many. In that case we can either 1) omit these NAs, or
# 2) replace each NA with the mean of the value just before it and the value
# just after it.

# Split the "time_UTC" stamp into a date part (the text before "T") and a time
# part (the text after "T"). The split is computed once and the pieces are
# pulled out with vapply(), so the new columns are plain character vectors.
# lapply() would store them as list columns, which are awkward to use with
# vectorised functions later on.
# fixed = TRUE matches the literal letter instead of relying on the undefined
# regex escapes "\\T" and "\\Z".
stamp.parts <- strsplit(as.character(new.weather$time_UTC), "T", fixed = TRUE)
new.weather$date <- vapply(stamp.parts, `[`, character(1), 1)
time.with.zone <- vapply(stamp.parts, `[`, character(1), 2)

# Remove the "Z" from the time by keeping only the text in front of it.
new.weather$time <- vapply(
  strsplit(time.with.zone, "Z", fixed = TRUE), `[`, character(1), 1
)

# Drop the raw time stamp and rearrange the columns by name
# (date, time and atmospheric pressure). Selecting by name avoids creating
# helper columns that then have to be removed by position.
new.weather <- new.weather[, c("date", "time", "AtmosphericPressure_mb")]

## So far we were only cleaning and rearranging the dataset, now let us work on NaN or NA values

##################################################################################################
#If you want to see the specific locations (row numbers) of the missing values, use the following functions:
#new.weather1 <- subset(new.weather, is.na(AtmosphericPressure_mb))
#where.nan <- which(new.weather$AtmosphericPressure_mb == "NaN")
#head(where.nan)
##################################################################################################

# First create another data frame, so that the cleaned and rearranged data
# frame stays untouched.
new.weather1 <- new.weather

# Then store the mean of the time series, computed only over the values that
# are not missing (na.rm = TRUE skips both NA and NaN).
mean1 <- mean(new.weather$AtmosphericPressure_mb, na.rm = TRUE)
mean1 # look at the mean of the time series without the missing values
# on the tutorial data this results in:
# [1] 1010.266

# Then replace the missing values in the time series by the mean of the
# series. Why the mean? Simply because replacing with the mean does not alter
# the overall mean of the series, whereas replacing missing values with ZERO
# definitely alters it (you can check this on your own).
#
# is.na() is TRUE for both NA and NaN, while is.nan() is TRUE for NaN only, so
# is.na() is used here to make sure no NA is left behind (a leftover NA would
# turn the mean below into NA).
new.weather1$AtmosphericPressure_mb[
  is.na(new.weather1$AtmosphericPressure_mb)
] <- mean1

# Want to check whether the mean is still the same after replacing the missing
# values? Here is the check:
mean2 <- mean(new.weather1$AtmosphericPressure_mb)
mean2
# which still gives the same mean as earlier:
# [1] 1010.266

##########################################################################################
# When the missing values are replaced with ZERO instead, what will the mean be?
new.weather2 <- new.weather
# is.na() covers both NA and NaN; is.nan() alone would leave NA values in
# place and the mean below would then be NA.
new.weather2$AtmosphericPressure_mb[
  is.na(new.weather2$AtmosphericPressure_mb)
] <- 0

# Want to check whether the mean is still the same after replacing the missing
# values? Here is the check:
mean3 <- mean(new.weather2$AtmosphericPressure_mb)
mean3
# which gives a different mean than earlier (on the tutorial data):
# [1] 917.5748

# That's why I replaced the missing values with the mean of the time series
# computed without the NaN/NA values.
# Got it??
##########################################################################################

# Check whether any missing values are left in the replaced time series.
# is.na() is used rather than is.nan() so that a leftover NA is reported too.
isnan <- which(is.na(new.weather1$AtmosphericPressure_mb))
isnan
# This should give an empty integer vector:
# integer(0)
# which confirms that the new time series new.weather1 has no missing values.

# Similarly, check whether there are any missing values in the date and time
# columns of the time series:
isnan1 <- which(is.na(new.weather1$date))
isnan1
isnan2 <- which(is.na(new.weather1$time))
isnan2
# In both cases you should receive integer(0) as the result.

################ NEXT STEP (will be explained in PART 3 of this tutorial series) #################

Advertisements