Created
June 11, 2012 08:34
Implementation of the Apriori Algorithm (as presented in the lecture "Knowledge Discovery in Databases" at LMU Munich) in Haskell including HUnit tests.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
module Main where | |
import qualified Data.List as List | |
import qualified Data.Set as Set | |
import Test.HUnit | |
main :: IO () | |
main = do { runTestTT allTests ; return () } | |
data Item = Item Char deriving (Eq, Ord) | |
instance Show Item where | |
show (Item c) = [c] | |
data Itemset = Itemset (Set.Set Item) deriving (Eq, Ord) | |
instance Show Itemset where | |
show (Itemset is) = let helper = foldl (++) "" $ map show $ Set.toList is in | |
"{" ++ (List.intersperse ',' helper) ++ "}" | |
type Support = Double | |
type Confidence = Double | |
data AssociationRule = AssociationRule Itemset Itemset Support Confidence deriving (Eq, Ord) | |
instance Show AssociationRule where | |
show (AssociationRule x y s c) = (show x) ++ " => " ++ (show y) ++ " (" ++ (show s) ++ ", " ++ (show c) ++ ")" | |
showList [] s = "[]" ++ s | |
showList (x:xs) s = "[\n " ++ (show x) ++ (showl xs) | |
where | |
showl [] = "\n]" ++ s | |
showl (y:ys) = ",\n " ++ show y ++ showl ys | |
itemsetFromString :: String -> Itemset | |
itemsetFromString s = Itemset $ Set.fromList $ map Item s | |
itemsetsFromStrings :: [String] -> [Itemset] | |
itemsetsFromStrings ss = map itemsetFromString ss | |
allItems :: Itemset | |
allItems = itemsetFromString "ABCDEFGHIKLM" | |
allItemsCount (Itemset is) = Set.size is | |
oneItemsets :: [Itemset] | |
oneItemsets = let asList (Itemset is) = Set.toList is in | |
map (\i -> Itemset $ Set.fromList [i]) $ asList allItems | |
transactions :: [Itemset] | |
transactions = itemsetsFromStrings ["BEGH", "ABCEGH", "ABCEFH", "BCDEFGHL", "ABEKH", "BEFGHIK", "ABDGH", "ABDG", "BDFG", "CEF", "ACEFH", "ABEG"] | |
transactionCount = length transactions | |
supportCount :: Itemset -> Int | |
supportCount (Itemset is) = length $ filter (Set.isSubsetOf is) $ map (\(Itemset x) -> x) transactions | |
support :: Itemset -> Support | |
support is = (fromIntegral $ supportCount is) / (fromIntegral $ transactionCount) | |
confidence :: Itemset -> Itemset -> Confidence | |
confidence lhsis@(Itemset lhs) (Itemset rhs) = (support $ Itemset $ lhs `Set.union` rhs) / (support lhsis) | |
------------------------------------------------------------------------ | |
-- Utilities | |
------------------------------------------------------------------------ | |
allSubsets :: (Ord a) => Set.Set a -> [Set.Set a] | |
allSubsets s = map Set.fromList $ List.subsequences $ Set.toList s | |
allMaximalProperSubsets :: (Ord a) => Set.Set a -> [Set.Set a] | |
allMaximalProperSubsets s = filter (\x -> (Set.size x) == (Set.size s) - 1) $ allSubsets s | |
------------------------------------------------------------------------ | |
-- Frequent Itemset Mining (FIM) | |
-- Usage: frequentItemsetMining 0.3 | |
------------------------------------------------------------------------ | |
frequentItemsetMining :: Support -> [Itemset] | |
frequentItemsetMining ms = fim oneItemsets | |
where | |
fim :: [Itemset] -> [Itemset] | |
fim [] = [] | |
fim iss = let piss = prune iss in piss ++ (fim $ frequentItemsetMiningCandidates piss) | |
-- prune is not entirely correct: this algorithm is iterating over all transactions for each of the candidates. | |
-- it's possible to achieve the same result with a single iteration. however, i'm to lazy to implement that now. | |
prune :: [Itemset] -> [Itemset] | |
prune iss = filter checkMinSupport iss | |
checkMinSupport is = support(is) >= ms | |
frequentItemsetMiningCandidates :: [Itemset] -> [Itemset] | |
frequentItemsetMiningCandidates iss = | |
-- the two k - 1 sets should differ in excactly 1 element before joining: | |
let validateCandidate a b = (Set.size $ a `Set.difference` b) == 1 in | |
let selfJoin = [Itemset (a `Set.union` b) | (Itemset a) <- iss, (Itemset b) <- iss, validateCandidate a b] in | |
let nonFrequentSubsets (Itemset s) = all (\s -> (Itemset s) `elem` iss) (allMaximalProperSubsets s) in | |
List.nub $ filter nonFrequentSubsets selfJoin | |
------------------------------------------------------------------------ | |
-- Association Rule Mining (ARM) | |
-- Usage: associationRuleMining 0.6 $ frequentItemsetMining 0.3 | |
------------------------------------------------------------------------ | |
associationRuleMining :: Confidence -> [Itemset] -> [AssociationRule] | |
associationRuleMining mc iss = filter checkMinConfidence $ allRules iss | |
where | |
allRules :: [Itemset] -> [AssociationRule] | |
allRules [] = [] | |
allRules ((Itemset is):iss) = | |
let validSubsets = filter (\x -> (not $ Set.null x) && x /= is) $ allSubsets is in | |
let lhss = map (\subset -> Itemset subset) validSubsets in | |
let rhss = map (\(Itemset lhs) -> Itemset $ Set.difference is lhs) lhss in | |
let newRule (lhs, rhs) = AssociationRule lhs rhs (support lhs) (confidence lhs rhs) in | |
(map newRule $ zip lhss rhss) ++ (allRules iss) | |
checkMinConfidence (AssociationRule _ _ _ c) = c >= mc | |
------------------------------------------------------------------------ | |
-- Closed Frequent Itemsets (CFI) - A frequent itemset X is called | |
-- closed if there exists no frequent superset Y ⊇ X | |
-- with support(X) = support(Y). | |
-- Usage: closedFrequentItemsets $ frequentItemsetMining 0.3 | |
------------------------------------------------------------------------ | |
closedFrequentItemsets :: [Itemset] -> [Itemset] | |
closedFrequentItemsets iss = filter (not . hasSupersetWithEqualSupport) iss | |
where | |
hasSupersetWithEqualSupport :: Itemset -> Bool | |
hasSupersetWithEqualSupport is@(Itemset s) = | |
let equalSupport x y = (supportCount x) == (supportCount y) in | |
any (\ix@(Itemset x) -> s `Set.isProperSubsetOf` x && (equalSupport ix is)) iss | |
------------------------------------------------------------------------ | |
-- Maximal Frequent Itemsets (MFI) - A frequent itemset is called | |
-- maximal if it is not a subset of any other frequent itemset. | |
-- Usage: maximalFrequentItemsets $ frequentItemsetMining 0.3 | |
------------------------------------------------------------------------ | |
maximalFrequentItemsets :: [Itemset] -> [Itemset] | |
maximalFrequentItemsets iss = filter (not . isSubset) iss | |
where | |
isSubset :: Itemset -> Bool | |
isSubset (Itemset is) = any (\(Itemset x) -> is `Set.isProperSubsetOf` x) iss | |
------------------------------------------------------------------------ | |
-- Unit Tests | |
------------------------------------------------------------------------ | |
allTests = TestList [fimTests, armTests, cfiTests, mfiTests] | |
------------------------------------------------------------------------ | |
-- Unit Tests for FIM | |
------------------------------------------------------------------------ | |
fimTests = TestLabel "Frequent Itemset Mining" $ TestList [fimCompareWithOfficialSolution, fimCandidatesTest] | |
fimCompareWithOfficialSolution = (List.sort expected) ~=? (List.sort actual) | |
where | |
expected = itemsetsFromStrings $ expected1 ++ expected2 ++ expected3 ++ expected4 | |
expected1 = ["A", "B", "C", "D", "E", "F", "G", "H"] | |
expected2 = ["AB", "AE", "AG", "AH", "BD", "BE", "BF", "BG", "BH", "CE", "CF", "CH", "DG", "EF", "EG", "EH", "FH", "GH"] | |
expected3 = ["ABE", "ABG", "ABH", "AEH", "BDG", "BEG", "BEH", "BGH", "CEF", "CEH", "EFH", "EGH"] | |
expected4 = ["BEGH"] | |
actual = frequentItemsetMining 0.3 | |
fimCandidatesTest = (List.sort expected) ~=? (List.sort actual) | |
where | |
expected = itemsetsFromStrings ["ABE", "ABG", "ABH", "AEG", "AEH", "AGH", "BDG", "BEF", "BEG", "BEH", "BFH", "BGH", "CEF", "CEH", "CFH", "EFH", "EGH"] | |
actual = frequentItemsetMiningCandidates $ itemsetsFromStrings ["AB", "AE", "AG", "AH", "BD", "BE", "BF", "BG", "BH", "CE", "CF", "CH", "DG", "EF", "EG", "EH", "FH", "GH"] | |
------------------------------------------------------------------------ | |
-- Unit Tests for ARM | |
------------------------------------------------------------------------ | |
armTests = TestLabel "Association Rule Mining" $ TestList [armCompareWithOfficialSolution, armAllRules] | |
armCompareWithOfficialSolution = (List.sort expected) ~=? (List.sort actual) | |
where | |
expected = [newRule "BEG" "H", | |
newRule "EG" "BH", | |
newRule "BEH" "G", | |
newRule "BGH" "E", | |
newRule "GH" "BE", | |
newRule "EGH" "B"] | |
newRule x y = let xs = itemsetFromString x in | |
let ys = itemsetFromString y in | |
AssociationRule xs ys (support xs) (confidence xs ys) | |
actual = associationRuleMining 0.6 $ [itemsetFromString "BEGH"] | |
armAllRules = (List.sort expected) ~=? (List.sort actual) | |
where | |
expected = [newRule "BEG" "H", | |
newRule "BE" "GH", | |
newRule "BG" "EH", | |
newRule "B" "EGH", -- this rule is missing from the offical solution | |
newRule "EG" "BH", | |
newRule "E" "BGH", | |
newRule "G" "BEH", | |
newRule "BEH" "G", | |
newRule "BH" "EG", | |
newRule "EH" "BG", | |
newRule "BGH" "E", | |
newRule "GH" "BE", | |
newRule "H" "BEG", | |
newRule "EGH" "B"] | |
newRule x y = let xs = itemsetFromString x in | |
let ys = itemsetFromString y in | |
AssociationRule xs ys (support xs) (confidence xs ys) | |
actual = associationRuleMining 0.0 $ [itemsetFromString "BEGH"] | |
------------------------------------------------------------------------ | |
-- Unit Tests for CFI | |
------------------------------------------------------------------------ | |
cfiTests = TestLabel "Closed Frequent Itemsets" $ TestList [cfiCompareWithOfficialSolution] | |
cfiCompareWithOfficialSolution = (List.sort expected) ~=? (List.sort actual) | |
where | |
expected = itemsetsFromStrings $ expected1 ++ expected2 ++ expected3 ++ expected4 | |
expected1 = ["A", "B", "E", "F", "H"] | |
expected2 = ["AB", "AE", "AH", "BE", "BF", "BG", "BH", "CE", "EF", "EH"] | |
expected3 = ["ABE", "ABG", "ABH", "AEH", "BDG", "BEG", "BEH", "BGH", "CEF", "CEH", "EFH"] | |
expected4 = ["BEGH"] | |
actual = closedFrequentItemsets $ frequentItemsetMining 0.3 | |
------------------------------------------------------------------------ | |
-- Unit Tests for MFI | |
------------------------------------------------------------------------ | |
mfiTests = TestLabel "Maximal Frequent Itemsets" $ TestList [mfiCompareWithOfficialSolution] | |
mfiCompareWithOfficialSolution = (List.sort expected) ~=? (List.sort actual) | |
where | |
expected = itemsetsFromStrings $ expected2 ++ expected3 ++ expected4 | |
expected2 = ["BF"] | |
expected3 = ["ABE", "ABG", "ABH", "AEH", "BDG", "CEF", "CEH", "EFH"] | |
expected4 = ["BEGH"] | |
actual = maximalFrequentItemsets $ frequentItemsetMining 0.3 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment