From 1783a63ef2cb8d5830470e9224e66af4597e3c94 Mon Sep 17 00:00:00 2001 From: Ernest Guevarra Date: Sat, 10 Aug 2024 20:14:48 +0100 Subject: [PATCH] process 2022 climate data --- R/pagasa_climate.R | 183 +++------------------------------------ R/pagasa_climate_utils.R | 44 ++++++++-- 2 files changed, 47 insertions(+), 180 deletions(-) diff --git a/R/pagasa_climate.R b/R/pagasa_climate.R index 92d543a..93dda10 100644 --- a/R/pagasa_climate.R +++ b/R/pagasa_climate.R @@ -1000,131 +1000,9 @@ climate_process_2021 <- function(climate_download_files) { pdf_path <- climate_download_files |> (\(x) x[stringr::str_detect(string = x, pattern = "2021")])() - pdfs <- lapply( - X = pdf_path, - FUN = function(x) { - pdftools::pdf_text(x) |> - stringr::str_split(pattern = "\n") |> - unlist() - } - ) - ## Get station information ---- - station <- lapply( - X = pdfs, - FUN = function(x) { - x[stringr::str_detect(string = x, pattern = "STATION")] |> - stringr::str_split(pattern = "\\s{2,100}", simplify = TRUE) |> - (\(x) x[ , 1])() |> - stringr::str_remove_all(pattern = "STATION: ") |> - stringr::str_to_title() |> - stringr::str_replace_all(pattern = "Cubi Pt.", replacement = "Cubi Point") |> - stringr::str_replace_all(pattern = "Naia", replacement = "NAIA") |> - stringr::str_replace_all(pattern = "Mia", replacement = "MIA") |> - stringr::str_replace_all(pattern = "Mco", replacement = "MCO") |> - stringr::str_replace_all(pattern = "Del", replacement = "del") |> - stringr::str_replace_all(pattern = "Former Vigan Station", replacement = "former Vigan Station") |> - stringr::str_replace_all(pattern = "Synop", replacement = "SYNOP") - } - ) |> - unlist() - - ## Get period information ---- - period <- lapply( - X = pdfs, - FUN = function(x) { - x[stringr::str_detect(string = x, pattern = "YEAR:")] |> - stringr::str_split(pattern = "\\s{2,100}", simplify = TRUE) |> - (\(x) x[ , 1])() |> - stringr::str_remove_all(pattern = "YEAR: ") |> - stringr::str_replace_all(pattern = " - ", replacement = "-") |> - stringr::str_to_title() - } - ) |> - unlist() - - ## Get station latitude information ---- - latitude <- lapply( - X = pdfs, - FUN = function(x) { - x[stringr::str_detect(string = x, pattern = "LATITUDE")] |> - stringr::str_split(pattern = "\\s{2,100}", simplify = TRUE) |> - (\(x) x[ , 2])() |> - stringr::str_replace_all(pattern = "14.76N", replacement = "14.76\"N") |> - stringr::str_replace_all(pattern = "13.20N", replacement = "13.20\"N") |> - stringr::str_remove_all("LATITUDE: ") |> - stringr::str_split(pattern = "o|°|'|\"", simplify = TRUE) |> - (\(x) x[x != ""])() |> - rbind() |> - data.frame() |> - (\(x) - { - names(x) <- c("degrees", "minutes", "seconds", "direction") - x - } - )() - } - ) |> - dplyr::bind_rows() |> - dplyr::mutate( - degrees = as.numeric(degrees), - minutes = as.numeric(minutes), - seconds = as.numeric(seconds), - latitude = degrees + (minutes / 60) + (seconds / 3600) - ) |> - dplyr::pull(latitude) - - ## Get station longitude information ---- - longitude <- lapply( - X = pdfs, - FUN = function(x) { - x[stringr::str_detect(string = x, pattern = "LONGITUDE:")] |> - stringr::str_split(pattern = "\\s{2,100}", simplify = TRUE) |> - (\(x) x[ , 2])() |> - stringr::str_replace_all(pattern = "56.76E", replacement = "56.76\"E") |> - stringr::str_replace_all(pattern = "08.10E", replacement = "0.810\"E") |> - stringr::str_replace_all(pattern = "57.53E", replacement = "14.76\"E") |> - stringr::str_remove_all("LONGITUDE: ") |> - stringr::str_split(pattern = "o|°|'|\"", simplify = TRUE) |> - (\(x) x[x != 
""])() |> - rbind() |> - data.frame() |> - (\(x) - { - names(x) <- c("degrees", "minutes", "seconds", "direction") - x - } - )() - } - ) |> - dplyr::bind_rows() |> - dplyr::mutate( - degrees = as.numeric(degrees), - minutes = as.numeric(minutes), - seconds = as.numeric(seconds), - longitude = degrees + (minutes / 60) + (seconds / 3600) - ) |> - dplyr::pull(longitude) - - ## Get station elevation information ---- - elevation <- lapply( - X = pdfs, - FUN = function(x) { - x[stringr::str_detect(string = x, pattern = "ELEVATION:")] |> - stringr::str_split(pattern = "\\s{2,1000}", simplify = TRUE) |> - (\(x) x[x != ""])() |> - stringr::str_remove_all(pattern = "ELEVATION: | m|m") |> - as.numeric() - } - ) |> - unlist() - - ## Create station information data.frame ---- - station_df <- tibble::tibble( - station = station, - latitude = latitude, - longitude = longitude, - elevation = elevation + station_df <- get_weather_station_info( + climate_download_files, period = "2021" ) ## Read text data from tables from each PDF ---- @@ -1148,14 +1026,14 @@ climate_process_2021 <- function(climate_download_files) { ) |> (\(x) { - names(x) <- paste0(station, "_", period) + names(x) <- paste0(station_df$station, "_", station_df$period) x } )() ## Process each PDF table with multiple rows per month ---- pdfs[[1]] <- pdfs[[1]] |> - structure_table_double(row_index = 9:11, col_index = 11:12) |> + structure_table_double(row_index = 9:11, col_index = 13:14) |> remove_table_rows() pdfs[[2]] <- pdfs[[2]] |> @@ -1474,47 +1352,7 @@ climate_process_2021 <- function(climate_download_files) { remove_table_rows() ## Final processing/structuring/concatenating of climate data ---- - lapply( - X = pdfs, - FUN = function(x) { - data.frame(x) |> - dplyr::rename_with( - .fn = function(x) - c("time_name", - "temperature_max", "temperature_max_date", - "temperature_min", "temperature_min_date", - "rainfall_max", "rainfall_max_date", - "windspeed_max", "windspeed_max_direction", "windspeed_max_date", - "sea_level_pressure_max", "sea_level_pressure_max_date", - "sea_level_pressure_min", "sea_level_pressure_min_date") - ) |> - dplyr::mutate( - time_name = dplyr::case_when( - time_name == "JULY" ~ "JUL", - time_name == "JUNE" ~ "JUN", - .default = time_name - ) |> - stringr::str_to_title() - ) - } - ) |> - dplyr::bind_rows(.id = "station_period") |> - dplyr::mutate( - station = stringr::str_extract_all( - string = station_period, pattern = "^[^_]*" - ) |> - unlist(), - time_period = stringr::str_remove_all( - string = station_period, pattern = "^[^_]*|_" - ), - .after = station_period - ) |> - dplyr::select(-station_period) |> - dplyr::mutate( - time_unit = ifelse(time_name %in% month.abb, "month", "year"), - .after = time_name - ) |> - tibble::tibble() + structure_climate_data(pdfs) } @@ -1524,14 +1362,15 @@ climate_process_2021 <- function(climate_download_files) { #' climate_process_2022 <- function(climate_download_files) { - station_df <- get_weather_station_info( - climate_download_files, period = "2022" - ) - ## Get vector of file paths for 2021 PDFS ---- pdf_path <- climate_download_files |> (\(x) x[stringr::str_detect(string = x, pattern = "2022")])() + ## Get weather station information ---- + station_df <- get_weather_station_info( + climate_download_files, period = "2022" + ) + ## Read text data from tables from each PDF ---- pdfs <- lapply( X = pdf_path, @@ -1560,7 +1399,7 @@ climate_process_2022 <- function(climate_download_files) { ## Process each PDF table with multiple rows per month ---- pdfs[[1]] <- pdfs[[1]] |> 
-    structure_table_double(row_index = 9:11, col_index = 11:12) |>
+    structure_table_double(row_index = 9:11, col_index = 13:14) |>
     remove_table_rows()
 
   pdfs[[2]] <- pdfs[[2]] |>
diff --git a/R/pagasa_climate_utils.R b/R/pagasa_climate_utils.R
index 9a0116d..854800e 100644
--- a/R/pagasa_climate_utils.R
+++ b/R/pagasa_climate_utils.R
@@ -1,6 +1,20 @@
 #'
 #' Helper function/s to process climate tables
+#'
+#' @param pdf A text matrix extracted from a climate PDF
+#' @param row_index An integer vector of row indices to structure
+#' @param col_index An integer vector of column indices to structure
 #'
+#' @returns The input text matrix with the values in the specified rows and columns restructured
+#'
+#' @examples
+#' \dontrun{
+#' structure_table_single(pdf, row_index = 1:3, col_index = 2:3)
+#' structure_table_double(pdf, row_index = 1:3, col_index = 2:3)
+#' structure_table_triple(pdf, row_index = 1:4, col_index = 2:3)
+#' structure_table_quadruple(pdf, row_index = 1:5, col_index = 2:3)
+#' }
+#'
+#' @rdname structure_table
+#' @export
 #'
 structure_table_single <- function(pdf, row_index, col_index) {
 
@@ -15,6 +29,10 @@ structure_table_single <- function(pdf, row_index, col_index) {
     )()
 }
 
+#'
+#' @rdname structure_table
+#' @export
+#'
 structure_table_double <- function(pdf, row_index, col_index) {
 
   pdf |>
@@ -30,6 +48,10 @@ structure_table_double <- function(pdf, row_index, col_index) {
     )()
 }
 
+#'
+#' @rdname structure_table
+#' @export
+#'
 structure_table_triple <- function(pdf, row_index, col_index) {
 
   pdf |>
@@ -49,29 +71,37 @@ structure_table_triple <- function(pdf, row_index, col_index) {
     )()
 }
 
+#'
+#' @rdname structure_table
+#' @export
+#'
 structure_table_quadruple <- function(pdf, row_index, col_index) {
 
   pdf |>
     (\(x)
       {
-        x[row_index[1], 2:14] <- x[row_index[1], 1:13]
-        x[row_index[1], 1] <- x[row_index[3], 1]
+        x[row_index[1], 2:14]  <- x[row_index[1], 1:13]
+        x[row_index[1], 1]     <- x[row_index[3], 1]
 
         x[row_index[2], col_index] <- x[row_index[2], ][x[row_index[2], ] != ""]
         x[row_index[2], 1:(col_index[1] - 1)] <- x[row_index[1], 1:(col_index[1] - 1)]
         x[row_index[2], (col_index[2] + 1):14] <- x[row_index[1], (col_index[2] + 1):14]
 
-        x[row_index[3], 2:14] <- x[row_index[1], 2:14]
-        x[row_index[3], col_index] <- x[row_index[4], ][x[row_index[4], ] != ""]
+        x[row_index[3], 2:14]      <- x[row_index[1], 2:14]
+        x[row_index[3], col_index] <- x[row_index[4], ][x[row_index[4], ] != ""]
 
-        x[row_index[4], ] <- x[row_index[1], ]
-        x[row_index[4], col_index] <- x[row_index[5], ][x[row_index[5], ] != ""]
+        x[row_index[4], ]          <- x[row_index[1], ]
+        x[row_index[4], col_index] <- x[row_index[5], ][x[row_index[5], ] != ""]
 
         x
       }
     )()
 }
 
+#'
+#' Remove empty climate data table rows
+#'
+
 remove_table_rows <- function(pdf_tab) {
 
   stringr::str_detect(pdf_tab[ , 1], pattern = "[A-Z]") |>
@@ -79,10 +109,8 @@ remove_table_rows <- function(pdf_tab) {
 }
 
-
 #'
 #' Retrieve weather station information from set of climate PDF datasets
-#'
 #'
 get_weather_station_info <- function(climate_download_files,
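
Usage sketch (reviewer note, not part of the patch): this refactor replaces the inline PDF parsing in climate_process_2021() and climate_process_2022() with the shared helpers get_weather_station_info() and structure_climate_data(). A minimal call of the refactored pipeline in R might look like the snippet below. It assumes `climate_download_files` is a character vector of local paths to previously downloaded PAGASA annual climate summary PDFs whose file names contain the report year; the "data-raw/climate" path and the list.files() step are illustrative only and not part of this patch.

  ## Hypothetical local copies of the downloaded climate summary PDFs;
  ## the processing functions select files by matching the year
  ## (e.g. "2022") in the file path
  climate_download_files <- list.files(
    path = "data-raw/climate", pattern = "\\.pdf$", full.names = TRUE
  )

  ## Weather station metadata for the 2022 reports via the shared helper
  station_df <- get_weather_station_info(climate_download_files, period = "2022")

  ## Full processing of the 2022 annual summaries; as with climate_process_2021(),
  ## the final tidy output is assumed to come from structure_climate_data()
  climate_2022 <- climate_process_2022(climate_download_files)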