Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fuzzy match #40

Open
wants to merge 7 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,9 @@ S3method(vec_ptype_abbr,addr)
S3method(vec_ptype_full,addr)
export(addr)
export(addr_match)
export(addr_match2)
export(addr_match_geocode)
export(addr_match_line_one)
export(addr_match_street)
export(addr_match_street_name_and_number)
export(addr_match_tiger_street_ranges)
Expand Down
4 changes: 2 additions & 2 deletions R/addr_geocode.R
Original file line number Diff line number Diff line change
Expand Up @@ -58,8 +58,8 @@ addr_match_geocode <- function(x,
x_addr_ref_match <-
addr_match(x_addr,
ref_addr,
stringdist_match = "osa_lt_1",
match_street_type = TRUE,
match_street_name = "osa_lt_1",
match_street_type = "exact",
simplify = TRUE
)

Expand Down
40 changes: 19 additions & 21 deletions R/addr_match.R
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,10 @@
#' See `stringdist::stringdist-metrics` for more details on string metrics and the optimal string alignment (`osa`) method.
#' @param x an addr vector to match
#' @param ref_addr an addr vector to search for matches in
#' @param stringdist_match method for determining string match of street name:
#' @param match_street_name method for determining string match of street name:
#' "osa_lt_1" requires an optimized string distance less than 1; "exact" requires an exact match
#' @param match_street_type logical; require street type to be identical to match?
#' @param match_street_type method for determining string match of street type:
#' "exact" requires an exact match; "none" ignores street types
#' @param simplify logical; randomly select one addr from multi-matches and return an
#' addr() vector instead of a list? (empty addr vectors and NULL values are converted
#' to NA)
Expand All @@ -29,8 +30,8 @@
#' @export
addr_match <- function(x,
ref_addr,
stringdist_match = c("osa_lt_1", "exact"),
match_street_type = TRUE, # TODO change into c("exact", "none")
match_street_name = c("osa_lt_1", "exact"),
match_street_type = c("exact", "none"),
simplify = TRUE) {
ia <- stats::na.omit(unique(as_addr(x)))
ra <- unique(as_addr(ref_addr))
Expand All @@ -49,7 +50,7 @@ addr_match <- function(x,

matches <-
purrr::map(zip_list, \(.) addr_match_street_name_and_number(.$ia, .$ra,
stringdist_match = stringdist_match,
match_street_name = match_street_name,
match_street_type = match_street_type,
simplify = FALSE
),
Expand All @@ -74,20 +75,17 @@ addr_match <- function(x,
return(out)
}

## addr_match_zip <- function(input_addr, ref_addr) {
## zip_dist <- stringdist::stringdistmatrix(vctrs::field(input_addr, "zip_code"), vctrs::field(ref_addr, "zip_code"))
## exact_matches <- apply(zip_dist, MARGIN = 1, FUN = \(.) which(. == 0), simplify = FALSE)
## names(exact_matches) <- as.character(input_addr)
## return(exact_matches)
## }

#' match addresses street names and numbers
#'
#' @rdname addr_match
#' @export
addr_match_street_name_and_number <- function(x, ref_addr, stringdist_match = c("osa_lt_1", "exact"), match_street_type = TRUE, simplify = TRUE) {
addr_match_street_name_and_number <- function(x,
ref_addr,
match_street_name = c("osa_lt_1", "exact"),
match_street_type = c("exact", "none"),
simplify = TRUE) {
street_name_matches <-
addr_match_street(x, ref_addr, stringdist_match = stringdist_match, match_street_type = match_street_type)
addr_match_street(x, ref_addr, match_street_name = match_street_name, match_street_type = match_street_type)
street_number_matches <-
stringdist::stringdistmatrix(
vctrs::field(x, "street_number"),
Expand Down Expand Up @@ -115,23 +113,24 @@ addr_match_street_name_and_number <- function(x, ref_addr, stringdist_match = c(
#' @rdname addr_match
#' @export
addr_match_street <- function(x, ref_addr,
stringdist_match = c("osa_lt_1", "exact"),
match_street_type = TRUE) {
stringdist_match <- rlang::arg_match(stringdist_match)
match_street_name = c("osa_lt_1", "exact"),
match_street_type = c("exact", "none")) {
match_street_name <- rlang::arg_match(match_street_name)
match_street_type <- rlang::arg_match(match_street_type)

street_name_dist <-
stringdist::stringdistmatrix(vctrs::field(x, "street_name"), vctrs::field(ref_addr, "street_name"))

exact_matches <- apply(street_name_dist, MARGIN = 1, FUN = \(.) which(. == 0), simplify = FALSE)

if (stringdist_match == "exact") {
if (match_street_name == "exact") {
the_matches <- exact_matches
} else if (stringdist_match == "osa_lt_1") {
} else if (match_street_name == "osa_lt_1") {
one_off_matches <- apply(street_name_dist, MARGIN = 1, FUN = \(.) which(. == 1), simplify = FALSE)
the_matches <- ifelse(lapply(exact_matches, length) != 0, exact_matches, one_off_matches)
}

if (match_street_type) {
if (match_street_type == "exact") {
street_type_matches <-
stringdist::stringdistmatrix(vctrs::field(x, "street_type"), vctrs::field(ref_addr, "street_type")) |>
apply(MARGIN = 1, FUN = \(.) which(. == 0), simplify = FALSE)
Expand All @@ -140,5 +139,4 @@ addr_match_street <- function(x, ref_addr,
return(the_matches)
}


utils::globalVariables(c("ia_zips", "ra_zips"))
110 changes: 110 additions & 0 deletions R/addr_match2.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
#' matching addr vectors
#'
#' Optimized String Alignment (OSA) distances are used to
#' choose a set of matching reference addr with flexible, field-specific thresholds.
#' See `fuzzy_match()`/`fuzzy_match_addr_field()` for more details.
#' @param x an addr vector to match
#' @param ref_addr an addr vector to search for matches in
#' @param simplify logical; randomly select one addr from multi-matches and return an
#' addr() vector instead of a list? (empty addr vectors and NULL values are converted
#' to NA)
#' @returns a named list of possible addr matches for each addr in `x`;
#' a list value of NULL means the zip code was not matched and
#' a list value of a zero-length addr vector means the zip code was matched,
#' but the street number, name, and type were not matched
#' @examples
#' addr(c("3333 Burnet Ave Cincinnati OH 45229", "5130 RAPID RUN RD CINCINNATI OHIO 45238")) |>
#' addr_match2(cagis_addr()$cagis_addr)
#'
#' addr(c("3333 Burnet Ave Cincinnati OH 45229", "5130 RAPID RUN RD CINCINNATI OHIO 45238")) |>
#' addr_match2(cagis_addr()$cagis_addr, simplify = FALSE) |>
#' tibble::enframe(name = "input_addr", value = "ca") |>
#' dplyr::mutate(ca = purrr::list_c(ca)) |>
#' dplyr::left_join(cagis_addr(), by = c("ca" = "cagis_addr")) |>
#' tidyr::unnest(cols = c(cagis_addr_data)) |>
#' dplyr::select(-ca, -cagis_address)
#' @export
addr_match2 <- function(x,
ref_addr,
max_dist_street_number = 0,
max_dist_street_name = 1,
max_dist_street_type = 0,
simplify = FALSE) {
ia <- stats::na.omit(unique(as_addr(x)))
ra <- unique(as_addr(ref_addr))

ia_zip_list <- split(ia, vctrs::field(ia, "zip_code"))

ra_zip_list <- split(ra, vctrs::field(ra, "zip_code"))
ra_zip_list <-
names(ia_zip_list) |>
purrr::map(\(.) purrr::pluck(ra_zip_list, ., .default = NA)) |>
purrr::set_names(names(ia_zip_list))

zip_list <-
purrr::transpose(list(ia = ia_zip_list, ra = ra_zip_list)) |>
purrr::discard(\(.) any(is.na(.$ia), is.na(.$ra)))

matches <-
purrr::map(zip_list, \(.x) addr_match_line_one(
.x$ia, .x$ra,
max_dist_street_number = max_dist_street_number,
max_dist_street_name = max_dist_street_name,
max_dist_street_type = max_dist_street_type,
simplify = FALSE
),
.progress = list(
clear = FALSE,
format = "matching addresses in {cli::pb_current}/{cli::pb_total} ZIP codes [{cli::pb_elapsed} elapsed] "
)
) |>
purrr::list_flatten(name_spec = "{inner}")

out <- matches[as.character(x)]
names(out) <- as.character(x)

if (simplify) {
out <-
out |>
purrr::modify_if(\(.) length(.) > 1, sample, size = 1) |>
purrr::modify_if(\(.) length(.) == 0, \(.) NA) |>
purrr::list_c(ptype = addr())
}

return(out)
}

#' match addr vectors based on street number, name, and type
#'
#' @param max_dist_street_number maximum OSA distance to consider a match for the addr street_number
#' @param max_dist_street_name maximum OSA distance to consider a match for the addr street_name
#' @param max_dist_street_type maximum OSA distance to consider a match for the addr street_type
#' @rdname addr_match2
#' @export
addr_match_line_one <- function(x, ref_addr,
max_dist_street_number = 0,
max_dist_street_name = 1,
max_dist_street_type = 0,
simplify = FALSE) {
matches <- list()
matches$street_number <- fuzzy_match_addr_field(x, ref_addr, "street_number", max_dist_street_number)
matches$street_name <- fuzzy_match_addr_field(x, ref_addr, "street_name", max_dist_street_name)
matches$street_type <- fuzzy_match_addr_field(x, ref_addr, "street_name", max_dist_street_name)

out <-
purrr::reduce(matches, \(.x, .y) purrr::map2(.x, .y, intersect)) |>
purrr::modify_if(\(.x) all(is.na(.x)), \(.x) NULL) |>
purrr::map(\(.x) ref_addr[.x]) |>
purrr::set_names(x)

if (simplify) {
out <-
out |>
purrr::modify_if(\(.x) length(.x) > 1, sample, size = 1) |>
purrr::modify_if(\(.x) length(.x) == 0, \(.x) NA) |>
purrr::list_c(ptype = addr())
}
return(out)
}

utils::globalVariables(c("ia_zips", "ra_zips"))
4 changes: 2 additions & 2 deletions R/addr_tiger_match.R
Original file line number Diff line number Diff line change
Expand Up @@ -82,8 +82,8 @@ addr_match_tiger_street_ranges <- function(x,
street_matches <-
addr_match_street(ia,
suppressWarnings(as_addr(glue::glue("1234 {names(d_tiger)} Anytown AB 00000"))),
stringdist_match = "osa_lt_1",
match_street_type = TRUE
match_street_name = "osa_lt_1",
match_street_type = "exact"
) |>
purrr::map(\(.) d_tiger[.]) |>
purrr::map(purrr::pluck, 1,
Expand Down
49 changes: 49 additions & 0 deletions R/fuzzy_match.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
#' fuzzy match strings in x to y using optimized string alignment (ignoring capitalization)
#' @param x character vector to match
#' @param y character vector to match to
#' @param osa_max_dist maximum OSA distance to consider a match
#' @param ties if multiple strings in `y` are tied for the minimum osa distances with a string in `x`,
#' then specify "first" or "random" as a tiebreaker
#' @return an integer vector representing the position of the best matching string in `y` for each string in `x`;
#' when `ties` is "all", a list of integer vectors is returned instead
#' @details `fuzzy_match_addr_field` is a helper to match addr vectors using fuzzy_match on a specific field
fuzzy_match <- function(x, y, osa_max_dist = 1, ties = c("first", "random", "all")) {
if (!rlang::is_character(x)) rlang::abort("x must be a character")
if (!rlang::is_character(y)) rlang::abort("y must be a character")
if (!rlang::is_bare_numeric(osa_max_dist)) rlang::abort("osa_max_dist must be a numeric")
ties <- rlang::arg_match(ties)

the_dist <- stringdist::stringdistmatrix(tolower(x), tolower(y), method = "osa")
min_dist_matches <- apply(the_dist,
MARGIN = 1,
FUN = \(.) which(min(.) <= osa_max_dist & . == min(.)),
simplify = FALSE
)
out <- purrr::modify_if(min_dist_matches, \(.) length(.) == 0, \(.) NA)

if (ties == "random") {
out <- purrr::modify_if(out, \(.) length(.) > 1, sample, size = 1)
}
if (ties == "first") {
out <- purrr::modify_if(out, \(.) length(.) > 1, \(tmp) tmp[1])
}
if (ties %in% c("random", "first")) out <- purrr::list_c(out, ptype = integer(1))

return(out)
}

#' @param x_addr addr vector to match
#' @param y_addr addr vector to match to
#' @param addr_field character name of `addr()` field to match on
#' @rdname fuzzy_match
fuzzy_match_addr_field <- function(x_addr, y_addr, addr_field, osa_max_dist = 0, ties = "all") {
if (!inherits(x_addr, "addr")) rlang::abort("x_addr must be an addr object")
if (!inherits(y_addr, "addr")) rlang::abort("y_addr must be an addr object")
rlang::check_required(addr_field)
addr_field <- rlang::arg_match(addr_field, vctrs::fields(x_addr))
out <-
purrr::map(list(x_addr, y_addr), \(.x) as.character(vctrs::field(.x, addr_field))) |>
purrr::list_modify(osa_max_dist = osa_max_dist, ties = ties) |>
do.call(fuzzy_match, args = _)
return(out)
}
17 changes: 9 additions & 8 deletions man/addr_match.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

63 changes: 63 additions & 0 deletions man/addr_match2.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading
Loading