-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit f8a9da2
Showing
25 changed files
with
893 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
Package: sentimentBR | ||
Title: Tools for Sentiment Analysis in English and Portuguese | ||
Version: 0.3.0 | ||
Authors@R: | ||
person("Jodavid", "Ferreira", email = "[email protected]",role = c("aut","cre"), comment = c(ORCID = "0000-0002-2131-6464")) | ||
Description: Based on the 'sentiment' package
    (https://cran.r-project.org/web/packages/sentiment/index.html) by
    Timothy P. Jurka, which was archived on 2012-12-13. This package keeps
    the English functionality and extends it to Brazilian Portuguese.
License: GPL-3 | ||
LazyLoad: yes | ||
Depends: | ||
R (>= 2.10), | ||
tm, | ||
Rstem | ||
RoxygenNote: 7.1.1 | ||
Encoding: UTF-8 | ||
Imports: | ||
stringi | ||
Suggests: | ||
testthat (>= 3.0.0) | ||
Config/testthat/edition: 3 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
exportPattern("^[[:alpha:]]+") | ||
exportPattern("^[^\\.]") | ||
import(Rstem) | ||
import(tm) | ||
importFrom("utils", "read.csv") |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,126 @@ | ||
#' Classifies the emotion (e.g. anger, disgust, fear, joy, sadness, surprise) of a set of texts.
#'
#' \code{classify_emotion} classifies the emotion (e.g. anger, disgust, fear, joy,
#' sadness, surprise) of a set of texts using a naive Bayes classifier trained on
#' Carlo Strapparava and Alessandro Valitutti's \code{\link{emotions}} lexicon.
#'
#' @param textColumns A \code{data.frame} of text documents listed one per row.
#' @param algorithm A \code{string} indicating whether to use the naive \code{bayes} algorithm or a simple \code{voter} algorithm.
#' @param prior A \code{numeric} specifying the prior probability to use for the naive Bayes classifier.
#' @param verbose A \code{logical} specifying whether to print detailed output regarding the classification process.
#' @param lang Language, "en" for English and "pt" for Brazilian Portuguese.
#' @param \dots Additional parameters to be passed into the \code{\link{create_matrix}} function.
#'
#' @return Returns an object of class \code{data.frame} with seven columns and one row for each document.
#' \item{anger}{The absolute log likelihood of the document expressing an angry sentiment.}
#' \item{disgust}{The absolute log likelihood of the document expressing a disgusted sentiment.}
#' \item{fear}{The absolute log likelihood of the document expressing a fearful sentiment.}
#' \item{joy}{The absolute log likelihood of the document expressing a joyous sentiment.}
#' \item{sadness}{The absolute log likelihood of the document expressing a sad sentiment.}
#' \item{surprise}{The absolute log likelihood of the document expressing a surprised sentiment.}
#' \item{best_fit}{The most likely sentiment category (e.g. anger, disgust, fear, joy, sadness, surprise) for the given text.}
#'
#' @author Timothy P. Jurka <tpjurka@@ucdavis.edu> and
#' Jodavid Ferreira <jdaf1@@de.ufpe.br>
#'
#' @examples
#' # DEFINE DOCUMENTS
#' documents <- c("I am very happy, excited, and optimistic.",
#'                "I am very scared, annoyed, and irritated.")
#'
#' # CLASSIFY EMOTIONS
#' classify_emotion(documents, algorithm = "bayes", verbose = TRUE, lang = "en")
#'
classify_emotion <- function(textColumns, algorithm = "bayes", prior = 1.0,
                             verbose = FALSE, lang = "en", ...) {
  matrix <- create_matrix(textColumns, ...)

  # Load the emotion lexicon for the requested language. Fail fast on an
  # unsupported 'lang' instead of erroring later with "object not found".
  if (lang == "en") {
    lexicon <- read.csv(system.file("data/emotions.csv.gz", package = "sentimentBR"), header = FALSE)
  } else if (lang == "pt") {
    lexicon <- read.csv(system.file("data/emotionspt.csv.gz", package = "sentimentBR"), header = FALSE)
  } else {
    stop("'lang' must be \"en\" or \"pt\"", call. = FALSE)
  }
  # Per-category lexicon sizes, used as naive Bayes priors below.
  counts <- list(anger    = sum(lexicon[, 2] == "anger",    na.rm = TRUE),
                 disgust  = sum(lexicon[, 2] == "disgust",  na.rm = TRUE),
                 fear     = sum(lexicon[, 2] == "fear",     na.rm = TRUE),
                 joy      = sum(lexicon[, 2] == "joy",      na.rm = TRUE),
                 sadness  = sum(lexicon[, 2] == "sadness",  na.rm = TRUE),
                 surprise = sum(lexicon[, 2] == "surprise", na.rm = TRUE),
                 total    = nrow(lexicon))

  # Strip accents so lexicon entries match the accent-stripped document terms.
  lexicon[, 1] <- rm_accent(lexicon[, 1])
  documents <- c()

  for (i in seq_len(nrow(matrix))) {
    if (verbose) print(paste("DOCUMENT", i))
    scores <- list(anger = 0, disgust = 0, fear = 0, joy = 0, sadness = 0, surprise = 0)
    doc <- matrix[i, ]
    words <- findFreqTerms(doc, lowfreq = 1)
    words <- rm_accent(words)

    for (word in words) {
      for (key in names(scores)) {
        emotions <- lexicon[which(lexicon[, 2] == key), ]
        # pmatch allows unique partial matches of a term against the lexicon.
        index <- pmatch(word, emotions[, 1], nomatch = 0)
        if (index > 0) {
          entry <- emotions[index, ]
          category <- as.character(entry[[2]])
          count <- counts[[category]]

          score <- 1.0
          if (algorithm == "bayes") score <- abs(log(score * prior / count))

          if (verbose) {
            print(paste("WORD:", word, "CAT:", category, "SCORE:", score))
          }
          scores[[category]] <- scores[[category]] + score
        }
      }
    }

    if (algorithm == "bayes") {
      # Add each category's |log(count/total)| class prior to its score.
      for (key in names(scores)) {
        scores[[key]] <- scores[[key]] + abs(log(counts[[key]] / counts[["total"]]))
      }
    } else {
      # Tiny epsilon keeps which.max well-defined for the voter algorithm.
      for (key in names(scores)) {
        scores[[key]] <- scores[[key]] + 0.000001
      }
    }

    best_fit <- names(scores)[which.max(unlist(scores))]
    # Legacy tie-break inherited from the original 'sentiment' package: a
    # "disgust" score near the empty-document baseline is treated as no signal.
    if (best_fit == "disgust" && as.numeric(unlist(scores[2])) - 3.09234 < .01) best_fit <- NA
    documents <- rbind(documents, c(scores$anger, scores$disgust, scores$fear, scores$joy,
                                    scores$sadness, scores$surprise, best_fit))
  }

  if (lang == "en") {
    colnames(documents) <- c("ANGER", "DISGUST", "FEAR", "JOY", "SADNESS", "SURPRISE", "BEST_FIT")
  } else if (lang == "pt") {
    # Translate the English category labels in BEST_FIT to Portuguese.
    # Fixed: the key was misspelled "sandness" and the value "triteza", so
    # sadness results were never translated; NA entries now stay NA instead
    # of erroring on switch(NA_character_, ...).
    translate <- function(x) {
      out <- array(NA, dim = length(x))
      for (j in seq_along(x)) {
        if (is.na(x[j])) next
        out[j] <- switch(x[j],
                         "anger"    = "raiva",
                         "disgust"  = "desgosto",
                         "fear"     = "medo",
                         "joy"      = "alegria",
                         "sadness"  = "tristeza",
                         "surprise" = "surpresa",
                         NA)
      }
      return(out)
    }
    colnames(documents) <- c("RAIVA", "DESGOSTO", "MEDO", "ALEGRIA", "TRISTEZA", "SURPRESA", "BEST_FIT")
    documents[, 7] <- translate(documents[, 7])
  }
  return(documents)
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,106 @@ | ||
#' Classifies the polarity (e.g. positive or negative) of a set of texts.
#'
#' \code{classify_polarity} Classifies the polarity (e.g. positive or negative) of a set of texts using a naive Bayes classifier trained on Janyce Wiebe's \code{\link{subjectivity}} lexicon.
#'
#' @param textColumns A \code{data.frame} of text documents listed one per row.
#' @param algorithm A \code{string} indicating whether to use the naive \code{bayes} algorithm or a simple \code{voter} algorithm.
#' @param pstrong A \code{numeric} specifying the probability that a strongly subjective term appears in the given text.
#' @param pweak A \code{numeric} specifying the probability that a weakly subjective term appears in the given text.
#' @param prior A \code{numeric} specifying the prior probability to use for the naive Bayes classifier.
#' @param verbose A \code{logical} specifying whether to print detailed output regarding the classification process.
#' @param lang Language, "en" for English and "pt" for Brazilian Portuguese.
#' @param \dots Additional parameters to be passed into the \code{\link{create_matrix}} function.
#'
#' @return Returns an object of class \code{data.frame} with four columns and one row for each document.
#' \item{pos}{The absolute log likelihood of the document expressing a positive sentiment.}
#' \item{neg}{The absolute log likelihood of the document expressing a negative sentiment.}
#' \item{pos/neg}{The ratio of absolute log likelihoods between positive and negative sentiment scores. A score of 1 indicates a neutral sentiment, less than 1 indicates a negative sentiment, and greater than 1 indicates a positive sentiment.}
#' \item{best_fit}{The most likely sentiment category (e.g. positive, negative, neutral) for the given text.}
#'
#' @author Timothy P. Jurka <tpjurka@@ucdavis.edu> and
#' Jodavid Ferreira <jdaf1@@de.ufpe.br>
#'
#'
#' @examples
#' # DEFINE DOCUMENTS
#' documents <- c("I am very happy, excited, and optimistic.",
#'                "I am very scared, annoyed, and irritated.")
#'
#' # CLASSIFY POLARITY
#' classify_polarity(documents,algorithm="bayes",verbose=TRUE, lang = "en")
#'
#'
classify_polarity <- function(textColumns,algorithm="bayes",pstrong=0.5,pweak=1.0,prior=1.0,verbose=FALSE,lang = "en",...) {
  matrix <- create_matrix(textColumns,...)

  # Load the subjectivity lexicon for the requested language and count the
  # positive/negative entries (used as naive Bayes class priors below).
  # NOTE(review): a 'lang' other than "en"/"pt" leaves 'lexicon' undefined
  # and fails below with an "object not found" error — confirm intended.
  if(lang == "en"){
    lexicon <- read.csv(system.file("data/subjectivity.csv.gz",package="sentimentBR"),header=FALSE)
    counts <- list(positive=length(which(lexicon[,3]=="positive")),negative=length(which(lexicon[,3]=="negative")),total=nrow(lexicon))
  }else if(lang == "pt"){
    lexicon <- read.csv(system.file("data/subjectivitypt.csv.gz",package="sentimentBR"),header=FALSE)
    counts <- list(positive=length(which(lexicon[,3]=="positive")),negative=length(which(lexicon[,3]=="negative")),total=nrow(lexicon))
  }

  # Strip accents so lexicon terms match the accent-stripped document words.
  lexicon[,1] <- rm_accent(lexicon[,1])
  documents <- c()

  for (i in 1:nrow(matrix)) {
    if (verbose) print(paste("DOCUMENT",i))
    scores <- list(positive=0,negative=0)
    doc <- matrix[i,]
    # All terms occurring at least once in this document.
    words <- findFreqTerms(doc,lowfreq=1)
    words <- rm_accent(words)

    for (word in words) {
      # pmatch allows a unique partial match of the term against the lexicon.
      index <- pmatch(word,lexicon[,1],nomatch=0)
      if (index > 0) {
        entry <- lexicon[index,]

        # Column 2 = subjectivity strength, column 3 = polarity category.
        polarity <- as.character(entry[[2]])
        category <- as.character(entry[[3]])
        count <- counts[[category]]

        # Strongly subjective terms are weighted by pstrong, weak by pweak.
        score <- pweak
        if (polarity == "strongsubj") score <- pstrong
        if (algorithm=="bayes") score <- abs(log(score*prior/count))

        if (verbose) {
          print(paste("WORD:",word,"CAT:",category,"POL:",polarity,"SCORE:",score))
        }

        scores[[category]] <- scores[[category]]+score
      }
    }

    if (algorithm=="bayes") {
      # Add each category's |log(count/total)| class prior.
      for (key in names(scores)) {
        count <- counts[[key]]
        total <- counts[["total"]]
        score <- abs(log(count/total))
        scores[[key]] <- scores[[key]]+score
      }
    } else {
      # Tiny epsilon keeps which.max well-defined for the voter algorithm.
      for (key in names(scores)) {
        scores[[key]] <- scores[[key]]+0.000001
      }
    }

    best_fit <- names(scores)[which.max(unlist(scores))]
    # Integer truncation: any |pos/neg| ratio in [1, 2) collapses to 1 and is
    # labelled "neutral". NOTE(review): inherited from the original
    # 'sentiment' package — confirm this coarse neutrality band is intended.
    ratio <- as.integer(abs(scores$positive/scores$negative))
    if (ratio==1) best_fit <- "neutral"
    documents <- rbind(documents,c(scores$positive,scores$negative,abs(scores$positive/scores$negative),best_fit))
    if (verbose) {
      print(paste("POS:",scores$positive,"NEG:",scores$negative,"RATIO:",abs(scores$positive/scores$negative)))
      cat("\n")
    }
  }

  colnames(documents) <- c("POS","NEG","POS/NEG","BEST_FIT")
  return(documents)
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,55 @@ | ||
#' Creates an object of class \code{DocumentTermMatrix} from \pkg{tm}.
#'
#' \code{create_matrix} builds a document-term matrix from one or more text
#' columns, optionally lowercasing, stripping numbers/punctuation/stopwords,
#' and stemming before tokenization.
#'
#' @param textColumns Either character vector (e.g. data$Title) or a \code{cbind()} of columns to use for training the algorithms (e.g. \code{cbind(data$Title,data$Subject)}).
#' @param language The language to be used for stemming the text data.
#' @param minDocFreq The minimum number of times a word should appear in a document for it to be included in the matrix. See package \pkg{tm} for more details.
#' @param minWordLength The minimum number of letters a word should contain to be included in the matrix. See package \pkg{tm} for more details.
#' @param removeNumbers A \code{logical} parameter to specify whether to remove numbers.
#' @param removePunctuation A \code{logical} parameter to specify whether to remove punctuation.
#' @param removeSparseTerms See package \pkg{tm} for more details.
#' @param removeStopwords A \code{logical} parameter to specify whether to remove stopwords using the language specified in language.
#' @param stemWords A \code{logical} parameter to specify whether to stem words using the language specified in language.
#' @param stripWhitespace A \code{logical} parameter to specify whether to strip whitespace.
#' @param toLower A \code{logical} parameter to specify whether to make all text lowercase.
#' @param weighting Either \code{weightTf} or \code{weightTfIdf}. See package \pkg{tm} for more details.
#'
#' @author Timothy P. Jurka <tpjurka@@ucdavis.edu>
#'
#' @examples
#' # DEFINE THE DOCUMENTS
#'
#' documents <- c("I am very happy, excited, and optimistic.",
#'                "I am very scared, annoyed, and irritated.",
#'                "Iraq's political crisis entered its second week one step closer to the potential
#'                dissolution of the government, with a call for elections by a vital coalition partner
#'                and a suicide attack that extended the spate of violence that has followed the withdrawal
#'                of U.S. troops.",
#'                "With nightfall approaching, Los Angeles authorities are urging residents to keep their
#'                outdoor lights on as police and fire officials try to catch the person or people responsible
#'                for nearly 40 arson fires in the last three days.")
#' matrix <- create_matrix(documents, language="english", removeNumbers=TRUE,
#'                         stemWords=FALSE, weighting=weightTfIdf)
#'
create_matrix <- function(textColumns, language="english", minDocFreq=1, minWordLength=3, removeNumbers=TRUE, removePunctuation=TRUE, removeSparseTerms=0, removeStopwords=TRUE, stemWords=FALSE, stripWhitespace=TRUE, toLower=TRUE, weighting=weightTf) {

  # Stem each whitespace-separated token with Rstem's wordStem.
  stemmer <- function(text) {
    tokens <- strsplit(text, " ")[[1]]
    return(wordStem(tokens, language = language))
  }

  # Preprocessing options forwarded to tm's DocumentTermMatrix.
  control <- list(language = language,
                  tolower = toLower,
                  removeNumbers = removeNumbers,
                  removePunctuation = removePunctuation,
                  stripWhitespace = stripWhitespace,
                  minWordLength = minWordLength,
                  stopwords = removeStopwords,
                  minDocFreq = minDocFreq,
                  weighting = weighting)
  if (stemWords == TRUE) control <- append(control, list(stemming = stemmer), after = 6)

  # Collapse each row's columns into one document string, forcing UTF-8.
  combined <- apply(as.matrix(textColumns), 1, paste, collapse = " ")
  combined <- sapply(as.vector(combined, mode = "character"), iconv, to = "UTF8", sub = "byte")

  corpus <- Corpus(VectorSource(combined), readerControl = list(language = language))
  dtm <- DocumentTermMatrix(corpus, control = control)
  if (removeSparseTerms > 0) dtm <- removeSparseTerms(dtm, removeSparseTerms)

  gc()
  return(dtm)
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,66 @@ | ||
#' Function to remove accents from words
#'
#' \code{rm_accent} removes accent marks (diacritics) from the characters of
#' a string vector, replacing each accented letter with its unaccented form.
#'
#' @param str vector of strings that will have their accents removed.
#' @param pattern string vector with one or more elements indicating which accents should be removed. To indicate which accents should be removed, a vector with the symbols should be passed. Example: pattern = c("\u00b4", "^") will strip out the acute and circumflex accents only. Other accepted words: "all" (remove all accents, which are "\u00b4", "`", "^", "~", "\u00a8", "\u00e7")
#'
#' @return vector without accent
#'
#' @examples
#' x <- "S\u00e3o Paulo"
#'
#' y <- rm_accent(x, pattern = "\u00b4")
#' print(y)
#'
#' y <- rm_accent(x, pattern = "~")
#' print(y)
#'
#' y <- rm_accent(x, pattern = "all")
#' print(y)
#'
rm_accent <- function(str, pattern = "all") {

  if (!is.character(str))
    str <- as.character(str)

  pattern <- unique(pattern)

  # Treat an upper-case cedilla in 'pattern' the same as the lower-case one.
  if (any(pattern == "\u00c7"))
    pattern[pattern == "\u00c7"] <- "\u00e7"

  # Accented characters per accent family. Base R resolves \uXXXX escapes at
  # parse time, so no runtime unescaping (stringi) is needed — this keeps the
  # function dependency-free while producing identical characters.
  symbols <- c(
    acute      = "\u00e1\u00e9\u00ed\u00f3\u00fa\u00c1\u00c9\u00cd\u00d3\u00da\u00fd\u00dd",
    grave      = "\u00e0\u00e8\u00ec\u00f2\u00f9\u00c0\u00c8\u00cc\u00d2\u00d9",
    circunflex = "\u00e2\u00ea\u00ee\u00f4\u00fb\u00c2\u00ca\u00ce\u00d4\u00db",
    tilde      = "\u00e3\u00f5\u00c3\u00d5\u00f1\u00d1",
    umlaut     = "\u00e4\u00eb\u00ef\u00f6\u00fc\u00c4\u00cb\u00cf\u00d6\u00dc\u00ff",
    cedil      = "\u00e7\u00c7"
  )

  # Unaccented replacements, position-aligned with 'symbols' for chartr().
  nudeSymbols <- c(
    acute      = "aeiouAEIOUyY",
    grave      = "aeiouAEIOU",
    circunflex = "aeiouAEIOU",
    tilde      = "aoAOnN",
    umlaut     = "aeiouAEIOUy",
    cedil      = "cC"
  )

  # Accent-mark symbols accepted in 'pattern', in the same order as 'symbols'.
  accentTypes <- c("\u00b4", "`", "^", "~", "\u00a8", "\u00e7")

  # Any "all"-like keyword strips every accent family in one chartr() pass.
  if (any(c("all", "al", "a", "todos", "t", "to", "tod", "todo") %in% pattern))
    return(chartr(paste(symbols, collapse = ""),
                  paste(nudeSymbols, collapse = ""), str))

  # Otherwise strip only the requested accent families.
  for (i in which(accentTypes %in% pattern))
    str <- chartr(symbols[i], nudeSymbols[i], str)

  return(str)
}
Oops, something went wrong.