---
title: "ASReview report"
output:
  html_document:
    toc: true
    toc_depth: 3
    number_sections: true
    theme: united
    toc_float: true
---
This report is an example of the output from an [open source tool](https://github.com/langtonhugh/asreview_irr) that makes it easier to compare the decisions of two raters after screening literature samples in [ASReview](https://asreview.nl/). The report can be generated automatically from the .csv files downloaded directly from the ASReview dashboard. The report is a work in progress. If you are an R user, please take a look at the [source code](https://github.com/langtonhugh/asreview_irr/blob/main/report_example.Rmd). Checks, feedback and suggestions for additional features are welcome. You can find my contact details on [my website](https://www.samlangton.info/).
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = FALSE, warning = FALSE, message = FALSE)
# Load packages.
library(readr)
library(dplyr)
library(stringr)
library(rlang)
library(purrr)
library(irr)
library(irrCAC)
library(tools)
library(glue)
library(readxl)
library(utils)
library(tidyr, include.only = "replace_na")
library(kableExtra)
library(tcltk)
library(waffle)
library(ggplot2) # ggplot2 functions are called directly in the plotting chunks below.
```
```{r choose_your_files!, error=TRUE, message=FALSE}
# Load data.
# Note: choose.files() is Windows-only; on other platforms, tcltk::tk_choose.files() works similarly.
file_1 <- choose.files(caption = "Select the .csv or .xlsx file for Rater A")
file_type1 <- file_ext(file_1)
tryCatch(
{
if (file_type1 == "xlsx") {
first_df <- read_excel(file_1)
} else if (file_type1 == "csv") {
first_df <- read_csv(file_1)
} else {
warning("File type not supported, try an Excel or a csv file")
}
},
error = function(e) {
warning("Unable to read the file Check if your file is valid")
stop(e)
})
if(nrow(first_df) == 0) {
stop("CSV is empty. Check if your file is a valid csv.")
}
file_2 <- choose.files(caption = "Select the .csv or .xlsx file for Rater B")
file_type2 <- file_ext(file_2)
tryCatch(
{
if (file_type2 == "xlsx") {
secon_df <- read_excel(file_2)
} else if (file_type2 == "csv") {
secon_df <- read_csv(file_2)
} else {
warning("File type not supported, try an Excel or a CSV file")
}
},
error = function(e) {
warning("Unable to read the second csv. Check if your file is a valid csv.")
stop(e)
})
if(nrow(secon_df) == 0) {
stop("CSV is empty. Check if your file is a valid csv.")
}
```
```{r validate}
if(nrow(first_df) != nrow(secon_df)) {
stop("Size of the compared dataframes does not match. Check for issues in the loaded files (e.g., duplicate rows).")
}
validate_name <- function(potential_names, name, df) {
for (string in potential_names) {
if (string %in% names(df)) {
return(string)
}
}
return(tk_select.list(names(df), title = glue("Please select the variable that contains the {name} for {deparse(substitute(df))}")))
}
id_name_1 <- validate_name('record_id', 'record id', first_df)
id_name_2 <- validate_name('record_id', 'record id', secon_df)
# Remove rows with duplicated IDs before extracting any columns,
# so the relevance vectors below stay consistent with the data frames.
first_df <- first_df[!duplicated(first_df[[id_name_1]]), ]
secon_df <- secon_df[!duplicated(secon_df[[id_name_2]]), ]
relevance_1 <- validate_name('included', 'ASReview relevance rating', first_df)
relevance_2 <- validate_name('included', 'ASReview relevance rating', secon_df)
first_relevance <- first_df[[relevance_1]]
secon_relevance <- secon_df[[relevance_2]]
authors_1 <- validate_name(c('first_authors', 'authors'), 'author names', first_df)
authors_2 <- validate_name(c('first_authors', 'authors'), 'author names', secon_df)
year_1 <- validate_name(c('publication_year', 'year'), 'publication year', first_df)
year_2 <- validate_name(c('publication_year', 'year'), 'publication year', secon_df)
title_1 <- validate_name(c('primary_title', 'title'), 'title', first_df)
title_2 <- validate_name(c('primary_title', 'title'), 'title', secon_df)
abstract_1 <- validate_name(c('notes_abstract', 'abstract'), 'abstract', first_df)
abstract_2 <- validate_name(c('notes_abstract', 'abstract'), 'abstract', secon_df)
```
# Files used
File 1 (selected for rater A):
```{r, comment = ""}
cat(file_1)
```
File 2 (selected for rater B):
```{r, comment = ""}
cat(file_2)
```
```{r calc}
# Relevant and irrelevant flags for rater A.
first_rele <- sum(first_relevance == 1, na.rm = TRUE)
first_irre <- sum(first_relevance == 0, na.rm = TRUE)
# Relevant and irrelevant flags for rater B.
secon_rele <- sum(secon_relevance == 1, na.rm = TRUE)
secon_irre <- sum(secon_relevance == 0, na.rm = TRUE)
# Total reviewed.
first_total <- first_rele + first_irre
secon_total <- secon_rele + secon_irre
# Not reviewed
first_unrev <- sum(is.na(first_relevance))
secon_unrev <- sum(is.na(secon_relevance))
# % reviewed.
first_prev <- round(100*first_total/nrow(first_df), 2)
secon_prev <- round(100*secon_total/nrow(secon_df), 2)
# Total 'irrelevant' (if we define un-reviewed as irrelevant)
first_trele <- first_irre + first_unrev
secon_trele <- secon_irre + secon_unrev
# % relevant out of all articles uploaded to ASReview.
first_ptrel <- round(100*first_rele/nrow(first_df), 2)
secon_ptrel <- round(100*secon_rele/nrow(secon_df), 2)
# % flagged relevant (out of those reviewed).
first_pfrel <- round(100*first_rele/first_total, 2)
secon_pfrel <- round(100*secon_rele/secon_total, 2)
# Disagreements 1: first rater says relevant, second rater says irrelevant.
first_relevant_vec <- first_df %>%
  filter(.data[[relevance_1]] == 1) %>%
  pluck(id_name_1)
disagree1_df <- secon_df %>%
  filter(.data[[relevance_2]] == 0, .data[[id_name_2]] %in% first_relevant_vec) %>%
  select(all_of(c(id_name_2, authors_2, year_2, title_2, abstract_2)))
# Define how many disagreements for the table.
first_dis <- nrow(disagree1_df)
# Disagreements 2: second rater says relevant, first rater says irrelevant.
secon_relevant_vec <- secon_df %>%
  filter(.data[[relevance_2]] == 1) %>%
  pluck(id_name_2)
# Subset first rater's irrelevant list with those the second rater said were relevant.
disagree2_df <- first_df %>%
  filter(.data[[relevance_1]] == 0, .data[[id_name_1]] %in% secon_relevant_vec) %>%
  select(all_of(c(id_name_1, authors_1, year_1, title_1, abstract_1)))
# Define how many disagreements for the table.
secon_dis <- nrow(disagree2_df)
# Sort and create vectors for Kappa.
first_vec <- first_df %>%
  arrange(.data[[id_name_1]]) %>%
  pluck(relevance_1)
secon_vec <- secon_df %>%
  arrange(.data[[id_name_2]]) %>%
  pluck(relevance_2)
# Create array for Kappa. Note this still contains missings for records that
# one or both raters did not review; these are automatically removed in the statistic.
# Note that the actual Kappa test is run later in the script.
full_array <- cbind(first_vec, secon_vec)
# Further disagreement statistics.
# Identify records left unreviewed by rater 1.
first_dist_vec <- first_df %>%
  filter(is.na(.data[[relevance_1]])) %>%
  pluck(id_name_1)
# Check. This should be TRUE.
# length(first_dist_vec) == first_unrev
# Identify records left unreviewed by rater 2.
secon_dist_vec <- secon_df %>%
  filter(is.na(.data[[relevance_2]])) %>%
  pluck(id_name_2)
# Create flag for those identified as relevant by the first rater, but not reviewed
# by the second. Note: the IDs themselves must be compared, not the column name string.
first_dist_df <- first_df %>%
  filter(.data[[relevance_1]] == 1) %>%
  mutate(missed_flag = if_else(.data[[id_name_1]] %in% secon_dist_vec,
                               "missed",
                               "not missed"),
         missed_flag = as.factor(missed_flag))
# Define the factor levels so we get frequencies of zeros, if there.
first_dist_df$missed_flag_fac <- factor(first_dist_df$missed_flag,
levels = c("not missed", "missed"))
# Frequency counts.
# table(first_dist_df$missed_flag_fac)
# Create flag for those identified as relevant by the second rater, but not reviewed
# by the first.
secon_dist_df <- secon_df %>%
filter(.data[[relevance_2]] == 1) %>%
mutate(missed_flag = if_else(.data[[id_name_2]] %in% first_dist_vec,
"missed",
"not missed"),
missed_flag = as.factor(missed_flag))
# Define the factor levels so we get frequencies of zeros, if there.
secon_dist_df$missed_flag_fac <- factor(secon_dist_df$missed_flag,
levels = c("not missed", "missed"))
# Frequency counts.
# table(secon_dist_df$missed_flag_fac)
# Create vectors with numbers of missed for the table.
missed1 <- table(first_dist_df$missed_flag_fac)[["missed"]]
missed2 <- table(secon_dist_df$missed_flag_fac)[["missed"]]
```
```{r irrdatasave}
# This is just to save a nice table summary for a paper, not for the report.
irr_df <- tibble(
` ` = c("n uploaded",
"n reviewed",
"% reviewed",
"n flagged relevant",
"n flagged irrelevant",
"% flagged relevant",
"n unreviewed",
"n irrelevant + unreviewed",
"% irrelevant + unreviewed",
"n relevant v. irrelevant",
"n relevant v. unreviewed"),
`rater A` = c(nrow(first_df),
first_total,
first_prev,
first_rele,
first_irre,
first_pfrel,
first_unrev,
first_trele,
first_ptrel,
first_dis,
missed1),
`rater B` = c(nrow(secon_df),
secon_total,
secon_prev,
secon_rele,
secon_irre,
secon_pfrel,
secon_unrev,
secon_trele,
secon_ptrel,
secon_dis,
missed2)
)
# Create the output folder if it does not exist yet.
if (!dir.exists("data")) dir.create("data")
write_csv(x = irr_df, file = "data/irr_table.csv")
```
# Rater summary
| | rater A | rater B |
|----------------------------|--------------------|--------------------|
| n uploaded | `r nrow(first_df)` | `r nrow(secon_df)` |
| n reviewed | `r first_total` | `r secon_total` |
| \% reviewed | `r first_prev` | `r secon_prev` |
| n flagged relevant | `r first_rele` | `r secon_rele` |
| n flagged irrelevant | `r first_irre` | `r secon_irre` |
| \% flagged relevant | `r first_pfrel` | `r secon_pfrel` |
| n unreviewed | `r first_unrev` | `r secon_unrev` |
| n irrelevant + unreviewed  | `r first_trele`    | `r secon_trele`    |
| \% total relevant | `r first_ptrel` | `r secon_ptrel` |
| n relevant v. irrelevant | `r first_dis` | `r secon_dis` |
| n relevant v. unreviewed | `r missed1` | `r missed2` |
# Overall summary
```{r additionaltable}
# Optionally remove the notes column (uncomment the select() line if needed).
secon_df <- secon_df %>%
# select(-exported_notes_1) %>% # add if needed.
mutate(rater_id = 2)
# rename(record_id = `ï..record_id`) # add if needed.
# Create rater id for the first rater.
first_df <- first_df %>%
mutate(rater_id = 1)
# Align rater B's column names with rater A's so the rows can be bound together.
colnames(secon_df)[colnames(secon_df) == id_name_2] <- id_name_1
colnames(secon_df)[colnames(secon_df) == relevance_2] <- relevance_1
colnames(secon_df)[colnames(secon_df) == authors_2] <- authors_1
colnames(secon_df)[colnames(secon_df) == title_2] <- title_1
colnames(secon_df)[colnames(secon_df) == abstract_2] <- abstract_1
colnames(secon_df)[colnames(secon_df) == year_2] <- year_1
first_df[[relevance_1]] <- as.numeric(first_df[[relevance_1]])
secon_df[[relevance_1]] <- as.numeric(secon_df[[relevance_1]])
# Bind together. Code unreviewed (NA) as 1000 so that summing the pair of
# ratings per record distinguishes every outcome:
# 0 = both irrelevant, 1 = relevant v. irrelevant, 2 = both relevant,
# 1000 = irrelevant v. unreviewed, 1001 = relevant v. unreviewed, 2000 = both unreviewed.
first_secon_review_df <- bind_rows(first_df, secon_df) %>%
  mutate(included_new = if_else(is.na(.data[[relevance_1]]), 1000, .data[[relevance_1]]))
# Create summary table.
compare_df <- first_secon_review_df %>%
# group_by(record_id, title) %>% # add if needed.
group_by(!!sym(id_name_1), !!sym(authors_1), !!sym(year_1), !!sym(title_1), !!sym(abstract_1)) %>% # comment out if needed.
summarise(agreement_number = sum(included_new)) %>%
ungroup()
# Summary table.
table2_summary_df <- count(compare_df, agreement_number) %>%
  mutate(stat = case_when(
    agreement_number == 0    ~ "Both rated irrelevant",
    agreement_number == 1    ~ "One rated relevant, other irrelevant",
    agreement_number == 2    ~ "Both rated relevant",
    agreement_number == 1000 ~ "One rated irrelevant, other left unreviewed",
    agreement_number == 1001 ~ "One rated relevant, other left unreviewed",
    agreement_number == 2000 ~ "Both left unreviewed"
  )) %>%
  select(stat, n) %>%
  rename(Description = stat)
both_raters_df <- table2_summary_df %>%
  filter(Description %in% c("Both rated irrelevant",
                            "One rated relevant, other irrelevant",
                            "Both rated relevant")) %>%
  summarize(Description = "Both raters made a decision",
            n = sum(n))
table2_summary_complete_df <- table2_summary_df %>%
bind_rows(both_raters_df)
kable(table2_summary_complete_df)
```
# Written explanations
- Rater A evaluated `r first_total` of `r nrow(first_df)` articles (`r first_prev`%), of which they flagged `r first_rele` as relevant and `r first_irre` as irrelevant.
- Rater B evaluated `r secon_total` of `r nrow(secon_df)` articles (`r secon_prev`%), of which they flagged `r secon_rele` as relevant and `r secon_irre` as irrelevant.
- Rater A left a total of `r first_unrev` articles unreviewed, while rater B left `r secon_unrev` unreviewed. In other words, these articles lay beyond the 'stop rule' and were never seen by the rater in question.
- Rater A flagged `r missed1` article(s) as relevant that rater B left unreviewed. This figure for Rater B was `r missed2`.
- The 'total irrelevant' figure is the sum of (1) the number of articles actively flagged as irrelevant by raters and (2) the number of articles left unreviewed after the stop rule (and therefore assumed to be irrelevant).
- Rater A flagged `r first_dis` articles as relevant that rater B flagged as irrelevant. Conversely, rater B flagged `r secon_dis` articles as relevant that rater A flagged as irrelevant.
- Here, the number of decisions eligible for the Kappa statistic is defined as the *both raters made a decision* figure. This is the sum of *both rated irrelevant*, *one rated relevant, other irrelevant* and *both rated relevant*. By default, then, the Kappa reported here does not include abstracts for which *one rated relevant, other left unreviewed*.
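As a quick check on this definition, the number of Kappa-eligible decisions can be recovered directly from the paired ratings built earlier (`full_array`); the count printed below should match the *both raters made a decision* row in the overall summary.
```{r}
# Records with an explicit decision from both raters (no NA in either column);
# this is the set that the listwise-deletion statistics below are computed on.
sum(complete.cases(full_array))
```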
# Visual summary
## Proportional breakdown
<center>
```{r, propbreakdown}
first_secon_review_df %>%
group_by(rater_id, included_new) %>%
summarize(freq = n()) %>%
mutate(rater_id = if_else(rater_id == 1, "Rater A", "Rater B"),
included_var = recode(included_new,
`0` = "Irrelevant",
`1` = "Relevant" ,
`1000` = "Unreviewed")) %>%
ggplot(data = .) +
geom_waffle(mapping = aes(fill = included_var, values = freq),
color = "white", make_proportional = TRUE, n_rows = 5,
alpha = 0.7,
size = 0.5) +
facet_wrap(~rater_id, ncol = 1) +
theme_void() +
theme(axis.text = element_blank(),
axis.ticks = element_blank(),
legend.position = "bottom",
plot.title = element_text(hjust = 0.5, size = 12),
legend.text = element_text(size = 9)) +
labs(fill = NULL,
title = "1 square = 1% of sample \n") +
scale_fill_brewer(palette = "Set1") +
coord_fixed(ratio = 1)
```
</center>
## Rater overlap
<center>
```{r, overlap}
# Reuse the summary built earlier for the overall summary table.
table2_summary_df %>%
  ggplot(data = .) +
geom_waffle(mapping = aes(fill = Description, values = n),
make_proportional = FALSE,
n_rows = 60,
size = 0.5,
alpha = 0.9,
colour = "white",
flip = TRUE) +
theme_void() +
labs(fill = NULL, title = "1 square = 1 article") +
scale_fill_brewer(palette = "Set2") +
theme(legend.position = "right",
axis.ticks = element_blank(),
axis.text = element_blank(),
plot.title = element_text(hjust = 0.5,size = 12),
legend.text = element_text(size = 9)) +
coord_fixed(ratio = 1)
```
</center>
# Comparison of different IRR methods
Here we present several kappa-style methods for calculating inter-rater reliability (IRR).
CAUTION! We are still actively researching which method is best for calculating inter-rater reliability in this setting. It is difficult to determine the optimal approach because the unrated items for both raters (the items that ASReview deems less relevant) depend on what was rated beforehand. This is not the kind of situation that most IRR statistics are equipped to handle, and those that can deal with unrated items assume the missing data are missing at random.
For now, we recommend Gwet's coefficient with listwise deletion. Note that this recommendation may change!
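The caution above can be made concrete with a toy example (synthetic data, not taken from the loaded files): when both raters leave the same items unreviewed, coding those missings as `-1` counts shared non-decisions as agreement, while listwise deletion does not.
```{r}
# Toy illustration (synthetic data): the raters disagree on both items they
# actually rated, and share four unreviewed items.
toy <- cbind(rater_a = c(1, 0, NA, NA, NA, NA),
             rater_b = c(0, 1, NA, NA, NA, NA))
agree(replace_na(toy, -1))        # ~67%: the four shared NAs count as 'agreement'.
agree(toy[complete.cases(toy), ]) # 0%: the raters never actually agreed.
```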
```{r}
# Create two versions of the ratings array: missings coded as -1, and complete cases only.
na_as_cat <- replace_na(full_array, -1)
complete_cases <- full_array[complete.cases(full_array),]
```
The raw percentage agreement between rater A and rater B is as follows:
```{r}
agree(full_array)
```
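For two raters, this statistic is simply the proportion of jointly rated items on which the decisions coincide:

$$p_o = \frac{\text{number of items with identical ratings}}{\text{number of items rated by both}}$$

It makes no correction for chance agreement; the coefficients below address that.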
## Cohen's kappa statistic (with missings coded as `-1`)
Cohen's kappa measures agreement between raters while correcting for chance. In this version, the missings are coded as a category of their own, which means they are counted as data. A downside is that this can artificially inflate agreement rates: unrated items are given the same weight as observed decisions, so shared non-decisions count as agreement.
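Concretely, kappa compares observed agreement $p_o$ against the agreement $p_e$ expected if both raters had labelled records at random according to their own marginal rates:

$$\kappa = \frac{p_o - p_e}{1 - p_e}$$

so $\kappa = 1$ indicates perfect agreement and $\kappa = 0$ agreement no better than chance.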
```{r}
kappa2(na_as_cat)
```
## Cohen's kappa statistic (with listwise deletion)
Cohen's kappa measures agreement between raters while correcting for chance. In this version, cases with missing values are excluded from the analysis. By only considering records where both raters made a decision, it provides a cleaner estimate of agreement. This can reduce the bias introduced by missing data and may represent true agreement more accurately.
```{r}
kappa2(complete_cases)
```
## Gwet's coefficient (with missings coded as `-1`)
Gwet's AC1 incorporates a prevalence adjustment: by weighting chance agreement according to the prevalence of the categories being rated, it aims to give a more balanced measure when one category dominates. The added complexity can make it harder to interpret. In this version, the missings are again coded as a category of their own, with the same downside as above: shared non-decisions count as agreement.
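For two raters and two categories, Gwet's AC1 uses the same form as kappa but replaces the chance term with one based on overall prevalence, where $\pi$ is the mean proportion of 'relevant' decisions across both raters:

$$AC_1 = \frac{p_o - p_e}{1 - p_e}, \qquad p_e = 2\pi(1 - \pi)$$

This keeps the coefficient stable when one category (here, 'irrelevant') heavily dominates.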
```{r}
gwet.ac1.raw(na_as_cat)
```
## Gwet's coefficient (with listwise deletion)
Gwet's AC1 incorporates the same prevalence adjustment, here combined with listwise deletion. By only considering records where both raters made a decision, it provides a cleaner estimate of agreement, reduces the bias introduced by missing data, and may represent true agreement more accurately.
```{r}
gwet.ac1.raw(complete_cases)
```
## Krippendorff's alpha
Krippendorff's alpha is a generalization of Cohen's kappa that can handle multiple raters and missing ratings. It has no prevalence correction (see Gwet's coefficient above). Note that it is computed here on the full array, including missings, which the method is designed to accommodate.
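The coefficient is defined through observed and expected disagreement rather than agreement:

$$\alpha = 1 - \frac{D_o}{D_e}$$

where $D_o$ is the disagreement observed among pairable values and $D_e$ the disagreement expected by chance, which is why the method accommodates missing ratings by design.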
```{r}
kripp.alpha(t(full_array))
```
# Disagreements
## Relevant versus unreviewed
The following are any articles that one rater flagged as 'relevant' but the other never reviewed because they lay beyond the stop rule.
```{r}
if (missed1 == 0 & missed2 == 0) {
print("There were no articles for which one rater flagged 'relevant' and the other did not review it.")
} else {
compare_df %>%
filter(agreement_number == 1001) %>%
select(-agreement_number) %>%
mutate(publication_year = str_extract_all(publication_year, "^.{4}")) %>%
kable(col.names = c("id", "author(s)", "year", "title", "abstract")) %>%
kable_styling()
}
```
## Rater A
Studies that rater A flagged as relevant and rater B flagged as irrelevant.
```{r}
# ASReview updated its default column names in late 2021. Using the detected
# column name (year_2) keeps this step working for both old and new projects;
# str_extract() trims the year field to its first four characters.
disagree1_df %>%
  mutate(across(all_of(year_2), ~ str_extract(.x, "^.{4}"))) %>%
  kable(col.names = c("id", "author(s)", "year", "title", "abstract")) %>%
  kable_styling()
  # %>% scroll_box(width = "120%", height = "300px") # add if a scrollable box is wanted.
```
<br>
<br>
## Rater B
Studies that rater B flagged as relevant and rater A flagged as irrelevant.
```{r}
# ASReview updated its default column names in late 2021. Using the detected
# column name (year_1) keeps this step working for both old and new projects;
# str_extract() trims the year field to its first four characters.
disagree2_df %>%
  mutate(across(all_of(year_1), ~ str_extract(.x, "^.{4}"))) %>%
  kable(col.names = c("id", "author(s)", "year", "title", "abstract")) %>%
  kable_styling()
  # %>% scroll_box(width = "120%", height = "300px") # add if a scrollable box is wanted.
```