patternapply

Iteratively try patterns against a character vector.
git clone https://git.eamoncaddigan.net/patternapply.git
Log | Files | Refs | README | LICENSE

patternapply.R (4049B)


      1 #' patternappy: A package for attempting to apply patterns to a vector.
      2 #' 
      3 #' This package contains the function \code{patternapply()}, which iteratively 
      4 #' tries to match a list of regular expressions to a vector and returns the
      5 #' associated replacement vector. This is useful for turning a column of text
      6 #' data with different formats into a matrix or data frame. S3 generics are also
      7 #' provided to assist with this.
      8 #' 
      9 #' @docType package
     10 #' @name patternapply-package
     11 NULL
     12 
     13 
     14 #' Iteratively try patterns against a character vector.
     15 #'
     16 #' @param X A character vector where matches are sought.
     17 #' @param patterns A vector of regular expression patterns.
     18 #' @param replacements A list of replacement information, which must match the 
     19 #'   length of \code{patterns}. Each element must be a character vector. This
     20 #'   can include backreferences "\1" to "\9" to parenthesized subexpressions of
     21 #'   the corresponding pattern.
     22 #'
     23 #' @return A list of replacement vectors with class "replacement_list".
     24 #' 
     25 #' @export
     26 patternapply <- function(X, patterns,
     27                          replacements = as.list(paste(seq_along(patterns)))) {
     28   # Check the inputs
     29   if (class(patterns) != "character") {
     30     stop("'patterns' must be a character vector")
     31   }
     32   if (class(replacements) != "list") {
     33     stop("'replacements' must be a list")
     34   }
     35   if (!all(vapply(replacements, function(x) class(x) == "character", logical(1)))) {
     36     stop("'replacements' can only contain character vectors")
     37   }
     38   if (length(patterns) != length(replacements)) {
     39     stop("'patterns' and 'replacements' must have the same length")
     40   }
     41 
     42   # Keep track of which records have already been matched to a pattern.
     43   matchIndex <- rep(NA_integer_, length(X))
     44 
     45   # This will contain a replacement vector for each element in X.
     46   replacementList <- vector("list", length(replacements))
     47 
     48   for (i in seq_along(patterns)) {
     49     # Match the pattern to the record, omitting items already matched.
     50     matchData <- regexec(patterns[i], X[is.na(matchIndex)])
     51 
     52     # Find all the places where matches occurred.
     53     matches <- vapply(matchData, `[`, integer(1), 1) != -1
     54     numMatches <- sum(matches)
     55 
     56     # Where are new matches?
     57     matchedHere <- rep(FALSE, length(X))
     58     matchedHere[is.na(matchIndex)] <- matches
     59 
     60     # Figure out which of the replacements are backreferences (and not just
     61     # strings).
     62     indexReplacements <- vapply(regmatches(replacements[[i]],
     63                                            regexec("^\\\\([1-9])$",
     64                                                    replacements[[i]])),
     65                                 function(x) as.integer(x[2]),
     66                                 integer(1))
     67     stringReplacements <- replacements[[i]]
     68     stringReplacements[!is.na(indexReplacements)] <- NA
     69 
     70     # Grab the matched groups
     71     matchedStrings <- regmatches(X[matchedHere], matchData[matches])
     72 
     73     # Create a replacement matrix. Each column is the replacement vector for a
     74     # record.
     75     backreferenceMatrix <- vapply(matchedStrings, `[`,
     76                                   character(length(indexReplacements)),
     77                                   indexReplacements+1)
     78     replacementMatrix <- matrix(rep(stringReplacements, numMatches),
     79                                 ncol = numMatches)
     80     replacementMatrix[is.na(replacementMatrix)] <- backreferenceMatrix[is.na(replacementMatrix)]
     81 
     82     # Fill in the data for the new matches.
     83     replacementList[matchedHere] <- lapply(seq_len(numMatches),
     84                                            function(i) replacementMatrix[, i])
     85 
     86     # Update the indices of the matches
     87     matchIndex[matchedHere] <- i
     88   }
     89 
     90   # Using col_names to hopefully support named vectors in the future.
     91   maxReplacements <- max(vapply(replacements, length, integer(1)))
     92   attr(replacementList, "col_names") <- paste0("match_", seq_len(maxReplacements))
     93 
     94   # This will probably be useful for people.
     95   attr(replacementList, "match_index") <- matchIndex
     96 
     97   # S3 Object
     98   class(replacementList) <- "replacement_list"
     99 
    100   return(replacementList)
    101 }