patternapply.R (4049B)
1 #' patternappy: A package for attempting to apply patterns to a vector. 2 #' 3 #' This package contains the function \code{patternapply()}, which iteratively 4 #' tries to match a list of regular expressions to a vector and returns the 5 #' associated replacement vector. This is useful for turning a column of text 6 #' data with different formats into a matrix or data frame. S3 generics are also 7 #' provided to assist with this. 8 #' 9 #' @docType package 10 #' @name patternapply-package 11 NULL 12 13 14 #' Iteratively try patterns against a character vector. 15 #' 16 #' @param X A character vector where matches are sought. 17 #' @param patterns A vector of regular expression patterns. 18 #' @param replacements A list of replacement information, which must match the 19 #' length of \code{patterns}. Each element must be a character vector. This 20 #' can include backreferences "\1" to "\9" to parenthesized subexpressions of 21 #' the corresponding pattern. 22 #' 23 #' @return A list of replacement vectors with class "replacement_list". 24 #' 25 #' @export 26 patternapply <- function(X, patterns, 27 replacements = as.list(paste(seq_along(patterns)))) { 28 # Check the inputs 29 if (class(patterns) != "character") { 30 stop("'patterns' must be a character vector") 31 } 32 if (class(replacements) != "list") { 33 stop("'replacements' must be a list") 34 } 35 if (!all(vapply(replacements, function(x) class(x) == "character", logical(1)))) { 36 stop("'replacements' can only contain character vectors") 37 } 38 if (length(patterns) != length(replacements)) { 39 stop("'patterns' and 'replacements' must have the same length") 40 } 41 42 # Keep track of which records have already been matched to a pattern. 43 matchIndex <- rep(NA_integer_, length(X)) 44 45 # This will contain a replacement vector for each element in X. 46 replacementList <- vector("list", length(replacements)) 47 48 for (i in seq_along(patterns)) { 49 # Match the pattern to the record, omitting items already matched. 50 matchData <- regexec(patterns[i], X[is.na(matchIndex)]) 51 52 # Find all the places where matches occurred. 53 matches <- vapply(matchData, `[`, integer(1), 1) != -1 54 numMatches <- sum(matches) 55 56 # Where are new matches? 57 matchedHere <- rep(FALSE, length(X)) 58 matchedHere[is.na(matchIndex)] <- matches 59 60 # Figure out which of the replacements are backreferences (and not just 61 # strings). 62 indexReplacements <- vapply(regmatches(replacements[[i]], 63 regexec("^\\\\([1-9])$", 64 replacements[[i]])), 65 function(x) as.integer(x[2]), 66 integer(1)) 67 stringReplacements <- replacements[[i]] 68 stringReplacements[!is.na(indexReplacements)] <- NA 69 70 # Grab the matched groups 71 matchedStrings <- regmatches(X[matchedHere], matchData[matches]) 72 73 # Create a replacement matrix. Each column is the replacement vector for a 74 # record. 75 backreferenceMatrix <- vapply(matchedStrings, `[`, 76 character(length(indexReplacements)), 77 indexReplacements+1) 78 replacementMatrix <- matrix(rep(stringReplacements, numMatches), 79 ncol = numMatches) 80 replacementMatrix[is.na(replacementMatrix)] <- backreferenceMatrix[is.na(replacementMatrix)] 81 82 # Fill in the data for the new matches. 83 replacementList[matchedHere] <- lapply(seq_len(numMatches), 84 function(i) replacementMatrix[, i]) 85 86 # Update the indices of the matches 87 matchIndex[matchedHere] <- i 88 } 89 90 # Using col_names to hopefully support named vectors in the future. 91 maxReplacements <- max(vapply(replacements, length, integer(1))) 92 attr(replacementList, "col_names") <- paste0("match_", seq_len(maxReplacements)) 93 94 # This will probably be useful for people. 95 attr(replacementList, "match_index") <- matchIndex 96 97 # S3 Object 98 class(replacementList) <- "replacement_list" 99 100 return(replacementList) 101 }