Created
March 18, 2020 06:34
-
-
Save ahmedsadman/93c267d49058349cb4de8019bd6bc220 to your computer and use it in GitHub Desktop.
Implementation of Apriori Algorithm for mining frequent patterns
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Description : Simple Python implementation of the Apriori Algorithm | |
Usage: | |
$python apriori.py -f DATASET.csv -s minSupport -c minConfidence | |
$python apriori.py -f DATASET.csv -s 0.15 -c 0.6 | |
Original Author: github.com/asaini | |
Modifications: github.com/ahmedsadman (Converted to Python 3 and output fixes) | |
Example Dataset: https://github.com/asaini/Apriori/blob/master/INTEGRATED-DATASET.csv | |
""" | |
import sys | |
from itertools import chain, combinations | |
from collections import defaultdict | |
from optparse import OptionParser | |
def subsets(arr): | |
""" Returns non empty subsets of arr""" | |
return chain(*[combinations(arr, i + 1) for i, a in enumerate(arr)]) | |
def returnItemsWithMinSupport(itemSet, transactionList, minSupport, freqSet): | |
"""calculates the support for items in the itemSet and returns a subset | |
of the itemSet each of whose elements satisfies the minimum support""" | |
_itemSet = set() | |
localSet = defaultdict(int) | |
for item in itemSet: | |
for transaction in transactionList: | |
if item.issubset(transaction): | |
freqSet[item] += 1 | |
localSet[item] += 1 | |
for item, count in localSet.items(): | |
support = float(count)/len(transactionList) | |
if support >= minSupport: | |
_itemSet.add(item) | |
return _itemSet | |
def joinSet(itemSet, length): | |
"""Join a set with itself and returns the n-element itemsets""" | |
return set([i.union(j) for i in itemSet for j in itemSet if len(i.union(j)) == length]) | |
def getItemSetTransactionList(data_iterator): | |
transactionList = list() | |
itemSet = set() | |
for record in data_iterator: | |
transaction = frozenset(record) | |
transactionList.append(transaction) | |
for item in transaction: | |
itemSet.add(frozenset([item])) # Generate 1-itemSets | |
return itemSet, transactionList | |
def runApriori(data_iter, minSupport, minConfidence): | |
""" | |
run the apriori algorithm. data_iter is a record iterator | |
Return both: | |
- items (tuple, support) | |
- rules ((pretuple, posttuple), confidence) | |
""" | |
itemSet, transactionList = getItemSetTransactionList(data_iter) | |
freqSet = defaultdict(int) | |
largeSet = dict() | |
# Global dictionary which stores (key=n-itemSets,value=support) | |
# which satisfy minSupport | |
assocRules = dict() | |
# Dictionary which stores Association Rules | |
oneCSet = returnItemsWithMinSupport(itemSet, | |
transactionList, | |
minSupport, | |
freqSet) | |
currentLSet = oneCSet | |
k = 2 | |
while(currentLSet != set([])): | |
largeSet[k-1] = currentLSet | |
currentLSet = joinSet(currentLSet, k) | |
currentCSet = returnItemsWithMinSupport(currentLSet, | |
transactionList, | |
minSupport, | |
freqSet) | |
currentLSet = currentCSet | |
k = k + 1 | |
def getSupport(item): | |
"""local function which Returns the support of an item""" | |
return float(freqSet[item])/len(transactionList) | |
toRetItems = [] | |
for key, value in largeSet.items(): | |
toRetItems.extend([(tuple(item), getSupport(item)) | |
for item in value]) | |
toRetRules = [] | |
for key, value in list(largeSet.items())[1:]: | |
for item in value: | |
_subsets = map(frozenset, [x for x in subsets(item)]) | |
for element in _subsets: | |
remain = item.difference(element) | |
if len(remain) > 0: | |
confidence = getSupport(item)/getSupport(element) | |
if confidence >= minConfidence: | |
toRetRules.append(((tuple(element), tuple(remain)), | |
confidence)) | |
return toRetItems, toRetRules | |
def printResults(items, rules): | |
"""prints the generated itemsets sorted by support and the confidence rules sorted by confidence""" | |
for item, support in sorted(items, key=lambda item__support: item__support[1]): | |
print("item: %s , %.3f" % (str(item), support)) | |
print("\n------------------------ RULES:") | |
for rule, confidence in sorted(rules, key=lambda rule__confidence: rule__confidence[1]): | |
pre, post = rule | |
print("Rule: %s ==> %s , %.3f" % (str(pre), str(post), confidence)) | |
def dataFromFile(fname): | |
"""Function which reads from the file and yields a generator""" | |
file_iter = open(fname, 'r') | |
for line in file_iter: | |
line = line.strip().rstrip(',') # Remove trailing comma | |
record = frozenset(line.split(',')) | |
yield record | |
if __name__ == "__main__": | |
optparser = OptionParser() | |
optparser.add_option('-f', '--inputFile', | |
dest='input', | |
help='filename containing csv', | |
default=None) | |
optparser.add_option('-s', '--minSupport', | |
dest='minS', | |
help='minimum support value', | |
default=0.15, | |
type='float') | |
optparser.add_option('-c', '--minConfidence', | |
dest='minC', | |
help='minimum confidence value', | |
default=0.6, | |
type='float') | |
(options, args) = optparser.parse_args() | |
inFile = None | |
if options.input is None: | |
inFile = sys.stdin | |
elif options.input is not None: | |
inFile = dataFromFile(options.input) | |
else: | |
print('No dataset filename specified, system with exit\n') | |
sys.exit('System will exit') | |
minSupport = options.minS | |
minConfidence = options.minC | |
items, rules = runApriori(inFile, minSupport, minConfidence) | |
printResults(items, rules) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment