patternapply

Iteratively try patterns against a character vector.
git clone https://git.eamoncaddigan.net/patternapply.git
Log | Files | Refs | README | LICENSE

commit c858ad82b888c55529d15e2d8c1c1acb66b57c74
parent 51c38bc9ecc4eb7932234c0e58547f0caecbb664
Author: eamoncaddigan <eamon.caddigan@gmail.com>
Date:   Mon, 22 Feb 2016 16:48:09 -0500

Working(ish) code.

Diffstat:
MR/patternapply.R | 75++++++++++++++++++++++++++++++++++++++++++++++++++++++---------------------
1 file changed, 54 insertions(+), 21 deletions(-)

diff --git a/R/patternapply.R b/R/patternapply.R @@ -1,37 +1,70 @@ #' Iteratively try patterns against a character vector. -#' +#' #' @param X A character vector where matches are sought. #' @param patterns A vector of regular expression patterns. -#' @param replacements A vector of replacement information, must match the -#' length of \code(patterns). This can either be a character vector or list of -#' character vectors. This can include backreferences "\1" to "\9" to -#' parenthesized subexpressions of the corresponding pattern. -#' -#' @return A vector of replacements. Matches the format of \code(replacements). +#' @param replacements A list of replacement information, must match the length +#' of \code(patterns). Each element must be a character vector. This can +#' include backreferences "\1" to "\9" to parenthesized subexpressions of the +#' corresponding pattern. +#' +#' @return A list of replacement vectors with class "replacement_list". patternapply <- function(X, patterns, - replacements = paste(seq_along(patterns))) { - + replacements = list(paste(seq_along(patterns)))) { # Keep track of which records have already been matched to a pattern. - matchFalses <- rep(FALSE, length(X)) - matchedAlready <- matchFalses + matchIndex <- rep(NA_integer_, length(X)) - for (pattern in patterns) { - # Match the pattern to the - matchedIndices <- regexec(pattern, X[!matchedAlready]) + # This will contain a replacement vector for each element in X. + replacementList <- list() + + for (i in seq_along(patterns)) { + # Match the pattern to the record, omitting items already matched. + matchData <- regexec(patterns[i], X[is.na(matchIndex)]) # Find all the places where matches occurred. - matches <- vapply(matchedIndices, `[`, integer(1), 1) != -1 - matchedStrings <- regmatches(X, matchedIndices) + matches <- vapply(matchData, `[`, integer(1), 1) != -1 + numMatches <- sum(matches) # Where are new matches? - matchedHere <- matchFalses - matchedHere[!matchedAlready] <- vapply(matchedIndices, `[`, integer(1), 1) != -1 + matchedHere <- rep(FALSE, length(X)) + matchedHere[is.na(matchIndex)] <- matches + + # Figure out which of the replacements are backreferences (and not just + # strings). + indexReplacements <- vapply(regmatches(replacements[[i]], + regexec("^\\\\([1-9])$", + replacements[[i]])), + function(x) as.integer(x[2]), + integer(1)) + stringReplacements <- replacements[[i]] + stringReplacements[!is.na(indexReplacements)] <- NA + + # Grab the matched groups + matchedStrings <- regmatches(X[matchedHere], matchData[matches]) + + # Create a replacement matrix. Each column is the replacement vector for a + # record. + backreferenceMatrix <- vapply(matchedStrings, `[`, + character(length(indexReplacements)), + indexReplacements+1) + replacementMatrix <- matrix(rep(stringReplacements, numMatches), + ncol = numMatches) + replacementMatrix[is.na(replacementMatrix)] <- backreferenceMatrix[is.na(replacementMatrix)] # Fill in the data for the new matches. - artists[matchedHere, commonCols] <- bioData[bioData$is_match, commonCols] + replacementList[matchedHere] <- lapply(seq_len(numMatches), + function(i) replacementMatrix[, i]) - # Update the list of matched rows. - matchedAlready <- matchedAlready | matchedHere + # Update the indices of the matches + matchIndex[matchedHere] <- i } + + # Using col_names to hopefully support named vectors in the future. + maxReplacements <- max(vapply(replacements, length, integer(1))) + attr(replacementList, "col_names") <- paste0("match_", seq_len(maxReplacements)) + + # This will probably be useful for people. + attr(replacementList, "match_index") <- matchIndex + + return(replacementList) }