# Text Mining – RegEx Example

library(stringr)

# Sample texts ----
# Three multi-line sample paragraphs. Each contains one or more ":" separators
# and a varying number of sentences (periods) ahead of the first ":".
# The line breaks embedded in each literal are stripped right after the
# literal is defined, so every sample ends up as one single-line string.
# NOTE(review): the breaks are removed, not replaced by a space, so words that
# were separated only by a line break get fused (e.g. "data" + "frame") unless
# the line ends with a trailing space. Confirm this is intended.

# Sample 1: a single sentence precedes the first ":".
x <- "There is a horror movie running in the iNox theater. : If row names are supplied of length one and the data
frame has a single row, the row.names is taken to specify the row names and not a column (by name or number).
If row names are supplied of length one and the data frame has a single row, the row.names is taken to specify
the row names and not a column (by name or number) Can we go : Please"

x <- gsub("[\r\n]", "", x)

# Sample 2: several short sentences precede the first ":".
y <- "There is a horror movie running in the iNox theater. If row names are supplied of length one and the data
frame has a single row, the row.names is taken. To specify the row names and not a column. By name or number. :
If row names are supplied of length one and the data frame has a single row, the row.names is taken to specify
the row names and not a column (by name or number) Can we go : Please"

y <- gsub("[\r\n]", "", y)

# Sample 3: the first ":" appears after several sentences, in the second line.
z <- "There is a horror movie running in the iNox theater. If row names are supplied of length one and the data frame has a single row, the row.names is taken to specify the row names and not a column (by name or number).
If row names are supplied of length one. : And the data frame has a single row, the row.names is taken to specify
the row names and not a column (by name or number) Can we go : Please"

z <- gsub("[\r\n]", "", z)

# One-column data frame holding the three cleaned samples as plain character
# strings. `FALSE` is spelled out because `F` is an ordinary variable that can
# be reassigned, whereas `FALSE` is a reserved word.
df <- data.frame(Text = c(x, y, z), row.names = NULL, stringsAsFactors = FALSE)

#
#
# processData <- function(a) {
# patt <- "^(?s)(?!(?:(?:[^:]*?\\.){3,}))(.*?):(.*)$"
# if(grepl(patt, a, perl=TRUE))
# {
# result <- str_match(a, patt)
# res <- c(result[2], result[3])
# #res <- data.frame(col1=result[2], col2=result[3], stringsAsFactors = F)
# }
# else
# {
# res <- c("NA", a)
# }
# return(res)
# }
#

# Empty result table: two character columns, zero rows.
resDF <- data.frame(
  Col1 = character(0),
  Col2 = character(0),
  stringsAsFactors = FALSE
)

#' Split a text at its first colon when the lead-in is short.
#'
#' The text is split into the part before the first ":" and the part after
#' it, but only when fewer than three periods occur before that colon.
#' Otherwise (three or more periods before the first colon, or no colon at
#' all) the text is returned unsplit.
#'
#' @param a A single string (length-one vector).
#' @return An unnamed character vector of length two. On a successful split:
#'   `c(<text before first colon>, <text after first colon>)`. Otherwise
#'   `c("NA", a)`.
processData <- function(a) {
  if (length(a) != 1L) {
    stop("`a` must be a single string, not length ", length(a), call. = FALSE)
  }

  # ^(?s)                      anchor at start; "." also matches line breaks
  # (?!(?:(?:[^:]*?\\.){3,}))  refuse to match when three or more periods
  #                            appear before the first colon
  # (.*?):(.*)$                group 1 = text before the first colon,
  #                            group 2 = everything after it
  patt <- "^(?s)(?!(?:(?:[^:]*?\\.){3,}))(.*?):(.*)$"

  # Match once, with PCRE only. The match is c(full, group1, group2); a
  # non-match (or a missing input) yields a zero-length vector.
  hit <- if (is.na(a)) {
    character(0)
  } else {
    regmatches(a, regexec(patt, a, perl = TRUE))[[1L]]
  }

  if (length(hit) == 3L) {
    col1 <- hit[2L]
    col2 <- hit[3L]
  } else {
    # NOTE(review): the placeholder is the literal string "NA", not a missing
    # value (NA_character_); kept as-is so existing consumers see no change.
    col1 <- "NA"
    col2 <- a
  }

  c(col1, col2)
}

# resDF[nrow(resDF) + 1, ] <- processData(x)
# resDF[nrow(resDF) + 1, ] <- processData(y)
# resDF[nrow(resDF) + 1, ] <- processData(z)

# Build the result table: one row per text in `df`, holding the two pieces
# returned by processData() in the character columns Col1 and Col2.
# - The texts are read from the `Text` column explicitly instead of relying on
#   `df[i, ]` collapsing to a string (which only happens for one-column frames).
# - No loop index is used, so an empty `df` simply gives a zero-row result.
# - Work happens inside local() so no helper variables leak into the
#   workspace; in particular the sample text `x` is no longer overwritten.
resDF <- local({
  pieces <- lapply(df[["Text"]], processData)
  data.frame(
    Col1 = vapply(pieces, function(p) p[1L], character(1)),
    Col2 = vapply(pieces, function(p) p[2L], character(1)),
    stringsAsFactors = FALSE
  )
})

# Advertisements (web-page residue; not part of the script)