-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit f8a9da2
Showing
25 changed files
with
893 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
Package: sentimentBR | ||
Title: Tools for Sentiment Analysis in English and Portuguese | ||
Version: 0.3.0 | ||
Authors@R: | ||
person("Jodavid", "Ferreira", email = "[email protected]",role = c("aut","cre"), comment = c(ORCID = "0000-0002-2131-6464")) | ||
Description: Based on the 'sentiment' package
    (https://cran.r-project.org/web/packages/sentiment/index.html) by
    Timothy P. Jurka, which was archived on 2012-12-13. This package keeps
    the English functionality and extends it to Brazilian Portuguese.
License: GPL-3 | ||
LazyLoad: yes | ||
Depends: | ||
R (>= 2.10), | ||
tm, | ||
Rstem | ||
RoxygenNote: 7.1.1 | ||
Encoding: UTF-8 | ||
Imports: | ||
stringi | ||
Suggests: | ||
testthat (>= 3.0.0) | ||
Config/testthat/edition: 3 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
exportPattern("^[[:alpha:]]+") | ||
exportPattern("^[^\\.]") | ||
import(Rstem) | ||
import(tm) | ||
importFrom("utils", "read.csv") |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,126 @@ | ||
#' Classifies the emotion (e.g. anger, disgust, fear, joy, sadness, surprise) of a set of texts.
#'
#' \code{classify_emotion} classifies the emotion (e.g. anger, disgust, fear, joy,
#' sadness, surprise) of a set of texts using a naive Bayes classifier trained on
#' Carlo Strapparava and Alessandro Valitutti's \code{\link{emotions}} lexicon.
#'
#' @param textColumns A \code{data.frame} of text documents listed one per row.
#' @param algorithm A \code{string} indicating whether to use the naive \code{bayes} algorithm or a simple \code{voter} algorithm.
#' @param prior A \code{numeric} specifying the prior probability to use for the naive Bayes classifier.
#' @param verbose A \code{logical} specifying whether to print detailed output regarding the classification process.
#' @param lang Language, "en" for English and "pt" for Brazilian Portuguese.
#' @param \dots Additional parameters to be passed into the \code{\link{create_matrix}} function.
#'
#' @return Returns an object of class \code{data.frame} with seven columns and one row for each document.
#' \item{anger}{The absolute log likelihood of the document expressing an angry sentiment.}
#' \item{disgust}{The absolute log likelihood of the document expressing a disgusted sentiment.}
#' \item{fear}{The absolute log likelihood of the document expressing a fearful sentiment.}
#' \item{joy}{The absolute log likelihood of the document expressing a joyous sentiment.}
#' \item{sadness}{The absolute log likelihood of the document expressing a sad sentiment.}
#' \item{surprise}{The absolute log likelihood of the document expressing a surprised sentiment.}
#' \item{best_fit}{The most likely sentiment category (e.g. anger, disgust, fear, joy, sadness, surprise) for the given text.}
#'
#' @author Timothy P. Jurka <tpjurka@@ucdavis.edu> and
#' Jodavid Ferreira <jdaf1@@de.ufpe.br>
#'
#' @examples
#' # DEFINE DOCUMENTS
#' documents <- c("I am very happy, excited, and optimistic.",
#'                "I am very scared, annoyed, and irritated.")
#'
#' # CLASSIFY EMOTIONS
#' classify_emotion(documents, algorithm = "bayes", verbose = TRUE, lang = "en")
#'
classify_emotion <- function(textColumns, algorithm = "bayes", prior = 1.0,
                             verbose = FALSE, lang = "en", ...) {
  matrix <- create_matrix(textColumns, ...)

  # Load the emotion lexicon for the requested language. Fail fast on an
  # unsupported 'lang' instead of erroring later with "object not found".
  if (lang == "en") {
    lexicon <- read.csv(system.file("data/emotions.csv.gz", package = "sentimentBR"), header = FALSE)
  } else if (lang == "pt") {
    lexicon <- read.csv(system.file("data/emotionspt.csv.gz", package = "sentimentBR"), header = FALSE)
  } else {
    stop("'lang' must be \"en\" or \"pt\"", call. = FALSE)
  }
  # Per-category lexicon sizes, used as naive Bayes priors below.
  counts <- list(anger    = sum(lexicon[, 2] == "anger",    na.rm = TRUE),
                 disgust  = sum(lexicon[, 2] == "disgust",  na.rm = TRUE),
                 fear     = sum(lexicon[, 2] == "fear",     na.rm = TRUE),
                 joy      = sum(lexicon[, 2] == "joy",      na.rm = TRUE),
                 sadness  = sum(lexicon[, 2] == "sadness",  na.rm = TRUE),
                 surprise = sum(lexicon[, 2] == "surprise", na.rm = TRUE),
                 total    = nrow(lexicon))

  # Strip accents so lexicon entries match the accent-stripped document terms.
  lexicon[, 1] <- rm_accent(lexicon[, 1])
  documents <- c()

  for (i in seq_len(nrow(matrix))) {
    if (verbose) print(paste("DOCUMENT", i))
    scores <- list(anger = 0, disgust = 0, fear = 0, joy = 0, sadness = 0, surprise = 0)
    doc <- matrix[i, ]
    words <- findFreqTerms(doc, lowfreq = 1)
    words <- rm_accent(words)

    for (word in words) {
      for (key in names(scores)) {
        emotions <- lexicon[which(lexicon[, 2] == key), ]
        # pmatch allows unique partial matches of a term against the lexicon.
        index <- pmatch(word, emotions[, 1], nomatch = 0)
        if (index > 0) {
          entry <- emotions[index, ]
          category <- as.character(entry[[2]])
          count <- counts[[category]]

          score <- 1.0
          if (algorithm == "bayes") score <- abs(log(score * prior / count))

          if (verbose) {
            print(paste("WORD:", word, "CAT:", category, "SCORE:", score))
          }
          scores[[category]] <- scores[[category]] + score
        }
      }
    }

    if (algorithm == "bayes") {
      # Add each category's |log(count/total)| class prior to its score.
      for (key in names(scores)) {
        scores[[key]] <- scores[[key]] + abs(log(counts[[key]] / counts[["total"]]))
      }
    } else {
      # Tiny epsilon keeps which.max well-defined for the voter algorithm.
      for (key in names(scores)) {
        scores[[key]] <- scores[[key]] + 0.000001
      }
    }

    best_fit <- names(scores)[which.max(unlist(scores))]
    # Legacy tie-break inherited from the original 'sentiment' package: a
    # "disgust" score near the empty-document baseline is treated as no signal.
    if (best_fit == "disgust" && as.numeric(unlist(scores[2])) - 3.09234 < .01) best_fit <- NA
    documents <- rbind(documents, c(scores$anger, scores$disgust, scores$fear, scores$joy,
                                    scores$sadness, scores$surprise, best_fit))
  }

  if (lang == "en") {
    colnames(documents) <- c("ANGER", "DISGUST", "FEAR", "JOY", "SADNESS", "SURPRISE", "BEST_FIT")
  } else if (lang == "pt") {
    # Translate the English category labels in BEST_FIT to Portuguese.
    # Fixed: the key was misspelled "sandness" and the value "triteza", so
    # sadness results were never translated; NA entries now stay NA instead
    # of erroring on switch(NA_character_, ...).
    translate <- function(x) {
      out <- array(NA, dim = length(x))
      for (j in seq_along(x)) {
        if (is.na(x[j])) next
        out[j] <- switch(x[j],
                         "anger"    = "raiva",
                         "disgust"  = "desgosto",
                         "fear"     = "medo",
                         "joy"      = "alegria",
                         "sadness"  = "tristeza",
                         "surprise" = "surpresa",
                         NA)
      }
      return(out)
    }
    colnames(documents) <- c("RAIVA", "DESGOSTO", "MEDO", "ALEGRIA", "TRISTEZA", "SURPRESA", "BEST_FIT")
    documents[, 7] <- translate(documents[, 7])
  }
  return(documents)
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,106 @@ | ||
#' Classifies the polarity (e.g. positive or negative) of a set of texts.
#'
#' \code{classify_polarity} Classifies the polarity (e.g. positive or negative) of a set of texts using a naive Bayes classifier trained on Janyce Wiebe's \code{\link{subjectivity}} lexicon.
#'
#' @param textColumns A \code{data.frame} of text documents listed one per row.
#' @param algorithm A \code{string} indicating whether to use the naive \code{bayes} algorithm or a simple \code{voter} algorithm.
#' @param pstrong A \code{numeric} specifying the probability that a strongly subjective term appears in the given text.
#' @param pweak A \code{numeric} specifying the probability that a weakly subjective term appears in the given text.
#' @param prior A \code{numeric} specifying the prior probability to use for the naive Bayes classifier.
#' @param verbose A \code{logical} specifying whether to print detailed output regarding the classification process.
#' @param lang Language, "en" for English and "pt" for Brazilian Portuguese.
#' @param \dots Additional parameters to be passed into the \code{\link{create_matrix}} function.
#'
#' @return Returns an object of class \code{data.frame} with four columns and one row for each document.
#' \item{pos}{The absolute log likelihood of the document expressing a positive sentiment.}
#' \item{neg}{The absolute log likelihood of the document expressing a negative sentiment.}
#' \item{pos/neg}{The ratio of absolute log likelihoods between positive and negative sentiment scores. A score of 1 indicates a neutral sentiment, less than 1 indicates a negative sentiment, and greater than 1 indicates a positive sentiment.}
#' \item{best_fit}{The most likely sentiment category (e.g. positive, negative, neutral) for the given text.}
#'
#' @author Timothy P. Jurka <tpjurka@@ucdavis.edu> and
#' Jodavid Ferreira <jdaf1@@de.ufpe.br>
#'
#'
#' @examples
#' # DEFINE DOCUMENTS
#' documents <- c("I am very happy, excited, and optimistic.",
#'                "I am very scared, annoyed, and irritated.")
#'
#' # CLASSIFY POLARITY
#' classify_polarity(documents,algorithm="bayes",verbose=TRUE, lang = "en")
#'
#'
classify_polarity <- function(textColumns,algorithm="bayes",pstrong=0.5,pweak=1.0,prior=1.0,verbose=FALSE,lang = "en",...) {
  matrix <- create_matrix(textColumns,...)

  # Load the subjectivity lexicon for the requested language and count the
  # positive/negative entries (used as naive Bayes class priors below).
  # NOTE(review): a 'lang' other than "en"/"pt" leaves 'lexicon' undefined
  # and fails below with an "object not found" error — confirm intended.
  if(lang == "en"){
    lexicon <- read.csv(system.file("data/subjectivity.csv.gz",package="sentimentBR"),header=FALSE)
    counts <- list(positive=length(which(lexicon[,3]=="positive")),negative=length(which(lexicon[,3]=="negative")),total=nrow(lexicon))
  }else if(lang == "pt"){
    lexicon <- read.csv(system.file("data/subjectivitypt.csv.gz",package="sentimentBR"),header=FALSE)
    counts <- list(positive=length(which(lexicon[,3]=="positive")),negative=length(which(lexicon[,3]=="negative")),total=nrow(lexicon))
  }

  # Strip accents so lexicon terms match the accent-stripped document words.
  lexicon[,1] <- rm_accent(lexicon[,1])
  documents <- c()

  for (i in 1:nrow(matrix)) {
    if (verbose) print(paste("DOCUMENT",i))
    scores <- list(positive=0,negative=0)
    doc <- matrix[i,]
    # All terms occurring at least once in this document.
    words <- findFreqTerms(doc,lowfreq=1)
    words <- rm_accent(words)

    for (word in words) {
      # pmatch allows a unique partial match of the term against the lexicon.
      index <- pmatch(word,lexicon[,1],nomatch=0)
      if (index > 0) {
        entry <- lexicon[index,]

        # Column 2 = subjectivity strength, column 3 = polarity category.
        polarity <- as.character(entry[[2]])
        category <- as.character(entry[[3]])
        count <- counts[[category]]

        # Strongly subjective terms are weighted by pstrong, weak by pweak.
        score <- pweak
        if (polarity == "strongsubj") score <- pstrong
        if (algorithm=="bayes") score <- abs(log(score*prior/count))

        if (verbose) {
          print(paste("WORD:",word,"CAT:",category,"POL:",polarity,"SCORE:",score))
        }

        scores[[category]] <- scores[[category]]+score
      }
    }

    if (algorithm=="bayes") {
      # Add each category's |log(count/total)| class prior.
      for (key in names(scores)) {
        count <- counts[[key]]
        total <- counts[["total"]]
        score <- abs(log(count/total))
        scores[[key]] <- scores[[key]]+score
      }
    } else {
      # Tiny epsilon keeps which.max well-defined for the voter algorithm.
      for (key in names(scores)) {
        scores[[key]] <- scores[[key]]+0.000001
      }
    }

    best_fit <- names(scores)[which.max(unlist(scores))]
    # Integer truncation: any |pos/neg| ratio in [1, 2) collapses to 1 and is
    # labelled "neutral". NOTE(review): inherited from the original
    # 'sentiment' package — confirm this coarse neutrality band is intended.
    ratio <- as.integer(abs(scores$positive/scores$negative))
    if (ratio==1) best_fit <- "neutral"
    documents <- rbind(documents,c(scores$positive,scores$negative,abs(scores$positive/scores$negative),best_fit))
    if (verbose) {
      print(paste("POS:",scores$positive,"NEG:",scores$negative,"RATIO:",abs(scores$positive/scores$negative)))
      cat("\n")
    }
  }

  colnames(documents) <- c("POS","NEG","POS/NEG","BEST_FIT")
  return(documents)
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,55 @@ | ||
#' Creates an object of class \code{DocumentTermMatrix} from \pkg{tm}.
#'
#' \code{create_matrix} builds a document-term matrix from one or more text
#' columns, optionally lowercasing, stripping numbers/punctuation/stopwords,
#' and stemming before tokenization.
#'
#' @param textColumns Either character vector (e.g. data$Title) or a \code{cbind()} of columns to use for training the algorithms (e.g. \code{cbind(data$Title,data$Subject)}).
#' @param language The language to be used for stemming the text data.
#' @param minDocFreq The minimum number of times a word should appear in a document for it to be included in the matrix. See package \pkg{tm} for more details.
#' @param minWordLength The minimum number of letters a word should contain to be included in the matrix. See package \pkg{tm} for more details.
#' @param removeNumbers A \code{logical} parameter to specify whether to remove numbers.
#' @param removePunctuation A \code{logical} parameter to specify whether to remove punctuation.
#' @param removeSparseTerms See package \pkg{tm} for more details.
#' @param removeStopwords A \code{logical} parameter to specify whether to remove stopwords using the language specified in language.
#' @param stemWords A \code{logical} parameter to specify whether to stem words using the language specified in language.
#' @param stripWhitespace A \code{logical} parameter to specify whether to strip whitespace.
#' @param toLower A \code{logical} parameter to specify whether to make all text lowercase.
#' @param weighting Either \code{weightTf} or \code{weightTfIdf}. See package \pkg{tm} for more details.
#'
#' @author Timothy P. Jurka <tpjurka@@ucdavis.edu>
#'
#' @examples
#' # DEFINE THE DOCUMENTS
#'
#' documents <- c("I am very happy, excited, and optimistic.",
#'                "I am very scared, annoyed, and irritated.",
#'                "Iraq's political crisis entered its second week one step closer to the potential
#'                dissolution of the government, with a call for elections by a vital coalition partner
#'                and a suicide attack that extended the spate of violence that has followed the withdrawal
#'                of U.S. troops.",
#'                "With nightfall approaching, Los Angeles authorities are urging residents to keep their
#'                outdoor lights on as police and fire officials try to catch the person or people responsible
#'                for nearly 40 arson fires in the last three days.")
#' matrix <- create_matrix(documents, language="english", removeNumbers=TRUE,
#'                         stemWords=FALSE, weighting=weightTfIdf)
#'
create_matrix <- function(textColumns, language="english", minDocFreq=1, minWordLength=3, removeNumbers=TRUE, removePunctuation=TRUE, removeSparseTerms=0, removeStopwords=TRUE, stemWords=FALSE, stripWhitespace=TRUE, toLower=TRUE, weighting=weightTf) {

  # Stem each whitespace-separated token with Rstem's wordStem.
  stemmer <- function(text) {
    tokens <- strsplit(text, " ")[[1]]
    return(wordStem(tokens, language = language))
  }

  # Preprocessing options forwarded to tm's DocumentTermMatrix.
  control <- list(language = language,
                  tolower = toLower,
                  removeNumbers = removeNumbers,
                  removePunctuation = removePunctuation,
                  stripWhitespace = stripWhitespace,
                  minWordLength = minWordLength,
                  stopwords = removeStopwords,
                  minDocFreq = minDocFreq,
                  weighting = weighting)
  if (stemWords == TRUE) control <- append(control, list(stemming = stemmer), after = 6)

  # Collapse each row's columns into one document string, forcing UTF-8.
  combined <- apply(as.matrix(textColumns), 1, paste, collapse = " ")
  combined <- sapply(as.vector(combined, mode = "character"), iconv, to = "UTF8", sub = "byte")

  corpus <- Corpus(VectorSource(combined), readerControl = list(language = language))
  dtm <- DocumentTermMatrix(corpus, control = control)
  if (removeSparseTerms > 0) dtm <- removeSparseTerms(dtm, removeSparseTerms)

  gc()
  return(dtm)
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,66 @@ | ||
#' Function to remove accents from words
#'
#' \code{rm_accent} removes accent marks (diacritics) from the characters of
#' a string vector, replacing each accented letter with its unaccented form.
#'
#' @param str vector of strings that will have their accents removed.
#' @param pattern string vector with one or more elements indicating which accents should be removed. To indicate which accents should be removed, a vector with the symbols should be passed. Example: pattern = c("\u00b4", "^") will strip out the acute and circumflex accents only. Other accepted words: "all" (remove all accents, which are "\u00b4", "`", "^", "~", "\u00a8", "\u00e7")
#'
#' @return vector without accent
#'
#' @examples
#' x <- "S\u00e3o Paulo"
#'
#' y <- rm_accent(x, pattern = "\u00b4")
#' print(y)
#'
#' y <- rm_accent(x, pattern = "~")
#' print(y)
#'
#' y <- rm_accent(x, pattern = "all")
#' print(y)
#'
rm_accent <- function(str, pattern = "all") {

  if (!is.character(str))
    str <- as.character(str)

  pattern <- unique(pattern)

  # Treat an upper-case cedilla in 'pattern' the same as the lower-case one.
  if (any(pattern == "\u00c7"))
    pattern[pattern == "\u00c7"] <- "\u00e7"

  # Accented characters per accent family. Base R resolves \uXXXX escapes at
  # parse time, so no runtime unescaping (stringi) is needed — this keeps the
  # function dependency-free while producing identical characters.
  symbols <- c(
    acute      = "\u00e1\u00e9\u00ed\u00f3\u00fa\u00c1\u00c9\u00cd\u00d3\u00da\u00fd\u00dd",
    grave      = "\u00e0\u00e8\u00ec\u00f2\u00f9\u00c0\u00c8\u00cc\u00d2\u00d9",
    circunflex = "\u00e2\u00ea\u00ee\u00f4\u00fb\u00c2\u00ca\u00ce\u00d4\u00db",
    tilde      = "\u00e3\u00f5\u00c3\u00d5\u00f1\u00d1",
    umlaut     = "\u00e4\u00eb\u00ef\u00f6\u00fc\u00c4\u00cb\u00cf\u00d6\u00dc\u00ff",
    cedil      = "\u00e7\u00c7"
  )

  # Unaccented replacements, position-aligned with 'symbols' for chartr().
  nudeSymbols <- c(
    acute      = "aeiouAEIOUyY",
    grave      = "aeiouAEIOU",
    circunflex = "aeiouAEIOU",
    tilde      = "aoAOnN",
    umlaut     = "aeiouAEIOUy",
    cedil      = "cC"
  )

  # Accent-mark symbols accepted in 'pattern', in the same order as 'symbols'.
  accentTypes <- c("\u00b4", "`", "^", "~", "\u00a8", "\u00e7")

  # Any "all"-like keyword strips every accent family in one chartr() pass.
  if (any(c("all", "al", "a", "todos", "t", "to", "tod", "todo") %in% pattern))
    return(chartr(paste(symbols, collapse = ""),
                  paste(nudeSymbols, collapse = ""), str))

  # Otherwise strip only the requested accent families.
  for (i in which(accentTypes %in% pattern))
    str <- chartr(symbols[i], nudeSymbols[i], str)

  return(str)
}
Oops, something went wrong.