Skip to content

Commit

Permalink
process 2022 climate data
Browse files Browse the repository at this point in the history
  • Loading branch information
ernestguevarra committed Aug 10, 2024
1 parent 1b4320d commit 1783a63
Show file tree
Hide file tree
Showing 2 changed files with 47 additions and 180 deletions.
183 changes: 11 additions & 172 deletions R/pagasa_climate.R
Original file line number Diff line number Diff line change
Expand Up @@ -1000,131 +1000,9 @@ climate_process_2021 <- function(climate_download_files) {
pdf_path <- climate_download_files |>
(\(x) x[stringr::str_detect(string = x, pattern = "2021")])()

pdfs <- lapply(
X = pdf_path,
FUN = function(x) {
pdftools::pdf_text(x) |>
stringr::str_split(pattern = "\n") |>
unlist()
}
)

## Get station information ----
station <- lapply(
X = pdfs,
FUN = function(x) {
x[stringr::str_detect(string = x, pattern = "STATION")] |>
stringr::str_split(pattern = "\\s{2,100}", simplify = TRUE) |>
(\(x) x[ , 1])() |>
stringr::str_remove_all(pattern = "STATION: ") |>
stringr::str_to_title() |>
stringr::str_replace_all(pattern = "Cubi Pt.", replacement = "Cubi Point") |>
stringr::str_replace_all(pattern = "Naia", replacement = "NAIA") |>
stringr::str_replace_all(pattern = "Mia", replacement = "MIA") |>
stringr::str_replace_all(pattern = "Mco", replacement = "MCO") |>
stringr::str_replace_all(pattern = "Del", replacement = "del") |>
stringr::str_replace_all(pattern = "Former Vigan Station", replacement = "former Vigan Station") |>
stringr::str_replace_all(pattern = "Synop", replacement = "SYNOP")
}
) |>
unlist()

## Get period information ----
period <- lapply(
X = pdfs,
FUN = function(x) {
x[stringr::str_detect(string = x, pattern = "YEAR:")] |>
stringr::str_split(pattern = "\\s{2,100}", simplify = TRUE) |>
(\(x) x[ , 1])() |>
stringr::str_remove_all(pattern = "YEAR: ") |>
stringr::str_replace_all(pattern = " - ", replacement = "-") |>
stringr::str_to_title()
}
) |>
unlist()

## Get station latitude information ----
latitude <- lapply(
X = pdfs,
FUN = function(x) {
x[stringr::str_detect(string = x, pattern = "LATITUDE")] |>
stringr::str_split(pattern = "\\s{2,100}", simplify = TRUE) |>
(\(x) x[ , 2])() |>
stringr::str_replace_all(pattern = "14.76N", replacement = "14.76\"N") |>
stringr::str_replace_all(pattern = "13.20N", replacement = "13.20\"N") |>
stringr::str_remove_all("LATITUDE: ") |>
stringr::str_split(pattern = "o|°|'|\"", simplify = TRUE) |>
(\(x) x[x != ""])() |>
rbind() |>
data.frame() |>
(\(x)
{
names(x) <- c("degrees", "minutes", "seconds", "direction")
x
}
)()
}
) |>
dplyr::bind_rows() |>
dplyr::mutate(
degrees = as.numeric(degrees),
minutes = as.numeric(minutes),
seconds = as.numeric(seconds),
latitude = degrees + (minutes / 60) + (seconds / 3600)
) |>
dplyr::pull(latitude)

## Get station longitude information ----
longitude <- lapply(
X = pdfs,
FUN = function(x) {
x[stringr::str_detect(string = x, pattern = "LONGITUDE:")] |>
stringr::str_split(pattern = "\\s{2,100}", simplify = TRUE) |>
(\(x) x[ , 2])() |>
stringr::str_replace_all(pattern = "56.76E", replacement = "56.76\"E") |>
stringr::str_replace_all(pattern = "08.10E", replacement = "0.810\"E") |>
stringr::str_replace_all(pattern = "57.53E", replacement = "14.76\"E") |>
stringr::str_remove_all("LONGITUDE: ") |>
stringr::str_split(pattern = "o|°|'|\"", simplify = TRUE) |>
(\(x) x[x != ""])() |>
rbind() |>
data.frame() |>
(\(x)
{
names(x) <- c("degrees", "minutes", "seconds", "direction")
x
}
)()
}
) |>
dplyr::bind_rows() |>
dplyr::mutate(
degrees = as.numeric(degrees),
minutes = as.numeric(minutes),
seconds = as.numeric(seconds),
longitude = degrees + (minutes / 60) + (seconds / 3600)
) |>
dplyr::pull(longitude)

## Get station elevation information ----
elevation <- lapply(
X = pdfs,
FUN = function(x) {
x[stringr::str_detect(string = x, pattern = "ELEVATION:")] |>
stringr::str_split(pattern = "\\s{2,1000}", simplify = TRUE) |>
(\(x) x[x != ""])() |>
stringr::str_remove_all(pattern = "ELEVATION: | m|m") |>
as.numeric()
}
) |>
unlist()

## Create station information data.frame ----
station_df <- tibble::tibble(
station = station,
latitude = latitude,
longitude = longitude,
elevation = elevation
station_df <- get_weather_station_info(
climate_download_files, period = "2021"
)

## Read text data from tables from each PDF ----
Expand All @@ -1148,14 +1026,14 @@ climate_process_2021 <- function(climate_download_files) {
) |>
(\(x)
{
names(x) <- paste0(station, "_", period)
names(x) <- paste0(station_df$station, "_", station_df$period)
x
}
)()

## Process each PDF table with multiple rows per month ----
pdfs[[1]] <- pdfs[[1]] |>
structure_table_double(row_index = 9:11, col_index = 11:12) |>
structure_table_double(row_index = 9:11, col_index = 13:14) |>
remove_table_rows()

pdfs[[2]] <- pdfs[[2]] |>
Expand Down Expand Up @@ -1474,47 +1352,7 @@ climate_process_2021 <- function(climate_download_files) {
remove_table_rows()

## Final processing/structuring/concatenating of climate data ----
lapply(
X = pdfs,
FUN = function(x) {
data.frame(x) |>
dplyr::rename_with(
.fn = function(x)
c("time_name",
"temperature_max", "temperature_max_date",
"temperature_min", "temperature_min_date",
"rainfall_max", "rainfall_max_date",
"windspeed_max", "windspeed_max_direction", "windspeed_max_date",
"sea_level_pressure_max", "sea_level_pressure_max_date",
"sea_level_pressure_min", "sea_level_pressure_min_date")
) |>
dplyr::mutate(
time_name = dplyr::case_when(
time_name == "JULY" ~ "JUL",
time_name == "JUNE" ~ "JUN",
.default = time_name
) |>
stringr::str_to_title()
)
}
) |>
dplyr::bind_rows(.id = "station_period") |>
dplyr::mutate(
station = stringr::str_extract_all(
string = station_period, pattern = "^[^_]*"
) |>
unlist(),
time_period = stringr::str_remove_all(
string = station_period, pattern = "^[^_]*|_"
),
.after = station_period
) |>
dplyr::select(-station_period) |>
dplyr::mutate(
time_unit = ifelse(time_name %in% month.abb, "month", "year"),
.after = time_name
) |>
tibble::tibble()
structure_climate_data(pdfs)
}


Expand All @@ -1524,14 +1362,15 @@ climate_process_2021 <- function(climate_download_files) {
#'

climate_process_2022 <- function(climate_download_files) {
station_df <- get_weather_station_info(
climate_download_files, period = "2022"
)

## Get vector of file paths for 2021 PDFS ----
pdf_path <- climate_download_files |>
(\(x) x[stringr::str_detect(string = x, pattern = "2022")])()

## Get weather station information ----
station_df <- get_weather_station_info(
climate_download_files, period = "2022"
)

## Read text data from tables from each PDF ----
pdfs <- lapply(
X = pdf_path,
Expand Down Expand Up @@ -1560,7 +1399,7 @@ climate_process_2022 <- function(climate_download_files) {

## Process each PDF table with multiple rows per month ----
pdfs[[1]] <- pdfs[[1]] |>
structure_table_double(row_index = 9:11, col_index = 11:12) |>
structure_table_double(row_index = 9:11, col_index = 13:14) |>
remove_table_rows()

pdfs[[2]] <- pdfs[[2]] |>
Expand Down
44 changes: 36 additions & 8 deletions R/pagasa_climate_utils.R
Original file line number Diff line number Diff line change
@@ -1,6 +1,20 @@
#'
#' Helper function/s to process climate tables
#'
#' @param pdf A text matrix from climate PDF
#' @param row_index A vector of row number indices to structure
#' @param col_index A vector of column number indices to structure
#'
#' @returns A text matrix from climate PDF
#'
#' @examples
#' structure_table_single(pdf, row_index = 1:3, col_index = 2:3)
#' structure_table_double(pdf, row_index = 1:3, col_index = 2:3)
#' structure_table_triple(pdf, row_index = 1:4, col_index = 2:3)
#' structure_table_quadruple(pdf, row_index = 1:5, col_index = 2:3)
#'
#' @rdname structure_table
#' @export
#'

structure_table_single <- function(pdf, row_index, col_index) {
Expand All @@ -15,6 +29,10 @@ structure_table_single <- function(pdf, row_index, col_index) {
)()
}

#'
#' @rdname structure_table
#' @export
#'

structure_table_double <- function(pdf, row_index, col_index) {
pdf |>
Expand All @@ -30,6 +48,10 @@ structure_table_double <- function(pdf, row_index, col_index) {
)()
}

#'
#' @rdname structure_table
#' @export
#'

structure_table_triple <- function(pdf, row_index, col_index) {
pdf |>
Expand All @@ -49,40 +71,46 @@ structure_table_triple <- function(pdf, row_index, col_index) {
)()
}

#'
#' @rdname structure_table
#' @export
#'

structure_table_quadruple <- function(pdf, row_index, col_index) {
pdf |>
(\(x)
{
x[row_index[1], 2:14] <- x[row_index[1], 1:13]
x[row_index[1], 1] <- x[row_index[3], 1]
x[row_index[1], 2:14] <- x[row_index[1], 1:13]
x[row_index[1], 1] <- x[row_index[3], 1]

x[row_index[2], col_index] <- x[row_index[2], ][x[row_index[2], ] != ""]
x[row_index[2], 1:(col_index[1] - 1)] <- x[row_index[1], 1:(col_index[1] - 1)]
x[row_index[2], (col_index[2] + 1):14] <- x[row_index[1], (col_index[2] + 1):14]

x[row_index[3], 2:14] <- x[row_index[1], 2:14]
x[row_index[3], col_index] <- x[row_index[4], ][x[row_index[4], ] != ""]
x[row_index[3], 2:14] <- x[row_index[1], 2:14]
x[row_index[3], col_index] <- x[row_index[4], ][x[row_index[4], ] != ""]

x[row_index[4], ] <- x[row_index[1], ]
x[row_index[4], col_index] <- x[row_index[5], ][x[row_index[5], ] != ""]
x[row_index[4], ] <- x[row_index[1], ]
x[row_index[4], col_index] <- x[row_index[5], ][x[row_index[5], ] != ""]

x
}
)()
}

#'
#' Remove empty climate data table rows
#'

remove_table_rows <- function(pdf_tab) {

stringr::str_detect(pdf_tab[ , 1], pattern = "[A-Z]") |>
(\(x) pdf_tab[x, ])()

}


#'
#' Retrieve weather station information from set of climate PDF datasets
#'
#'

get_weather_station_info <- function(climate_download_files,
Expand Down

0 comments on commit 1783a63

Please sign in to comment.