Created
January 7, 2011 23:45
-
-
Save rtirrell/770325 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Frequent-itemset mining (first two passes) over supermarket-style
# transactions, run under the R profiler. Profiling is switched off again at
# the bottom of the script so the collected samples can be summarised.
Rprof()

# Read a list of about 100K vectors, each with fewer than 30 items
# (most with a few). These are supermarket-type transactions.
transactions <- lapply(
  strsplit(readLines("Data/retail.dat"), " "),
  as.numeric
)
transactions.unlisted <- unlist(transactions)

# Count the total number of items over all transactions.
nitems <- length(transactions.unlisted)

# And the number of occurrences of each item. The table is indexed by
# position; the item ids themselves live in names(counts).
counts <- table(transactions.unlisted)

# The minimum count required for a k-tuple to be considered a frequent
# set of items.
threshold.count <- 0.02 * length(transactions)

# Set of frequent tuples for k = 1. Recover the item ids from the table names
# rather than using table positions: positions only coincide with ids when
# the ids happen to be exactly 1..n, and the pair test below compares against
# the real ids stored in each transaction.
frequent <- list(
  single = as.numeric(names(counts))[as.vector(counts) > threshold.count]
)

# All candidate 2-tuples, one pair per row. combn() treats a single positive
# integer x as seq_len(x) and fails on empty input, so fall back to an empty
# two-column matrix when fewer than two frequent items exist.
frequent$cdouble <- if (length(frequent$single) >= 2) {
  t(combn(frequent$single, 2))
} else {
  matrix(numeric(0), ncol = 2)
}

# Only some of which are truly frequent (occur in at least threshold.count
# transactions). rm.rows[i] is set to 1 for candidate rows to be dropped.
rm.rows <- numeric(nrow(frequent$cdouble))

# Only the first few candidates are checked (each check scans every
# transaction); the cap is bounded by the number of candidates actually
# available so short candidate lists do not index out of range.
n.checked <- min(5, nrow(frequent$cdouble))
for (i in seq_len(n.checked)) {
  candidate <- frequent$cdouble[i, ]
  # Number of transactions containing both items of the candidate pair.
  # Alternative membership test: length(intersect(candidate, r)) == 2.
  support <- sum(vapply(
    transactions,
    function(r) all(candidate %in% r),
    logical(1)
  ))
  if (support < threshold.count) {
    rm.rows[i] <- 1
  }
}

# Stop the profiler started at the top of the script.
Rprof(NULL)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment