import copy
import itertools
from collections import defaultdict
from operator import itemgetter
An event is a list of strings.
A sequence is a list of events.
A dataset is a list of sequences.
Thus, a dataset is a list of lists of lists of strings.
# Running example used throughout the notebook: four sequences over the items a-d.
dataset = [
    [["a"], ["a", "b", "c"], ["a", "c"], ["c"]],
    [["a"], ["c"], ["b", "c"]],
    [["a", "b"], ["d"], ["c"], ["b"], ["c"]],
    [["a"], ["c"], ["b"], ["c"]]
]
"""
This is a simple recursive method that checks if subsequence is a subSequence of mainSequence
"""
def isSubsequence(mainSequence, subSequence):
subSequenceClone = list(subSequence) # clone the sequence, because we will alter it
return isSubsequenceRecursive(mainSequence, subSequenceClone) #start recursion
"""
Function for the recursive call of isSubsequence, not intended for external calls
"""
def isSubsequenceRecursive(mainSequence, subSequenceClone, start=0):
# Check if empty: End of recursion, all itemsets have been found
if (not subSequenceClone):
return True
# retrieves element of the subsequence and removes is from subsequence
firstElem = set(subSequenceClone.pop(0))
# Search for the first itemset...
for i in range(start, len(mainSequence)):
if (set(mainSequence[i]).issuperset(firstElem)):
# and recurse
return isSubsequenceRecursive(mainSequence, subSequenceClone, i + 1)
return False
aSequence = [["a"], ["b", "c"], ["d"], ["a", "e"]]
isSubsequence(aSequence, [["a"], ["d"], ["e"]])
True
isSubsequence(aSequence, [["a"], ["b", "c"], ["e"]])
True
isSubsequence(aSequence, [["a"], ["b", "d"]])
False
"""
Computes the length of the sequence (sum of the length of the contained itemsets)
"""
def sequenceLength(sequence):
return sum(len(i) for i in sequence)
print sequenceLength ([["a"], ["b", "c"], ["a"], ["b","c","d"]])
7
"""
Computes the support of a sequence in a dataset
"""
def countSupport (dataset, candidateSequence):
return sum(1 for seq in dataset if isSubsequence(seq, candidateSequence))
dataset
[[['a'], ['a', 'b', 'c'], ['a', 'c'], ['c']], [['a'], ['c'], ['b', 'c']], [['a', 'b'], ['d'], ['c'], ['b'], ['c']], [['a'], ['c'], ['b'], ['c']]]
countSupport(dataset, [["b"]])
4
countSupport(dataset, [["a"], ["b", "c"]])
2
"""
Generates one candidate of length k from two candidates of length (k-1) as used in the AprioriAll algorithm
"""
def generateCandidatesForPair(cand1, cand2):
cand1Clone = copy.deepcopy(cand1)
cand2Clone = copy.deepcopy(cand2)
# drop the leftmost item from cand1:
if (len (cand1[0]) == 1):
cand1Clone.pop(0)
else:
cand1Clone[0] = cand1Clone[0][1:]
# drop the rightmost item from cand2:
if (len (cand2[-1]) == 1):
cand2Clone.pop(-1)
else:
cand2Clone[-1] = cand2Clone[-1][:-1]
# if the result is not the same, then we dont need to join
if not cand1Clone == cand2Clone:
return []
else:
newCandidate = copy.deepcopy(cand1)
if (len (cand2[-1]) == 1):
newCandidate.append(cand2[-1])
else:
newCandidate [-1].extend(cand2[-1][-1])
return newCandidate
candidateA = [["a"], ["b", "c"], ["d"]]
candidateB = [["b", "c"], ["d", "e"]]
generateCandidatesForPair(candidateA, candidateB)
[['a'], ['b', 'c'], ['d', 'e']]
candidateA = [["a"], ["b", "c"], ["d"]]
candidateC = [["b", "c"], ["d"], ["e"]]
generateCandidatesForPair(candidateA, candidateC)
[['a'], ['b', 'c'], ['d'], ['e']]
candidateA = [["a"], ["b", "c"], ["d"]]
candidateD = [["a"], ["b", "c"], ["e"]]
generateCandidatesForPair(candidateA, candidateD)
[]
"""
Generates the set of candidates of length k from the set of frequent sequences with length (k-1)
"""
def generateCandidates(lastLevelCandidates):
k = sequenceLength(lastLevelCandidates[0]) + 1
if (k == 2):
flatShortCandidates = [item for sublist2 in lastLevelCandidates for sublist1 in sublist2 for item in sublist1]
result = [[[a, b]] for a in flatShortCandidates for b in flatShortCandidates if b > a]
result.extend([[[a], [b]] for a in flatShortCandidates for b in flatShortCandidates])
return result
else:
candidates = []
for i in range(0, len(lastLevelCandidates)):
for j in range(0, len(lastLevelCandidates)):
newCand = generateCandidatesForPair(lastLevelCandidates[i], lastLevelCandidates[j])
if (not newCand == []):
candidates.append(newCand)
candidates.sort()
return candidates
An example: let's assume we know the frequent sequences of level 2:
lastLevelFrequentPatterns = [
[['a', 'b']],
[['b', 'c']],
[['a'], ['b']],
[['a'], ['c']],
[['b'], ['c']],
[['c'], ['b']],
[['c'], ['c']],
]
Then we can generate the candidates for level 3
newCandidates = generateCandidates(lastLevelFrequentPatterns)
newCandidates
[[['a'], ['b'], ['c']], [['a'], ['b', 'c']], [['a'], ['c'], ['b']], [['a'], ['c'], ['c']], [['a', 'b'], ['c']], [['a', 'b', 'c']], [['b'], ['c'], ['b']], [['b'], ['c'], ['c']], [['b', 'c'], ['b']], [['b', 'c'], ['c']], [['c'], ['b'], ['c']], [['c'], ['b', 'c']], [['c'], ['c'], ['b']], [['c'], ['c'], ['c']]]
"""
Computes all direct subsequence for a given sequence.
A direct subsequence is any sequence that originates from deleting exactly one item from any event in the original sequence.
"""
def generateDirectSubsequences(sequence):
result = []
for i, itemset in enumerate(sequence):
if (len(itemset) == 1):
sequenceClone = copy.deepcopy(sequence)
sequenceClone.pop(i)
result.append(sequenceClone)
else:
for j in range(len(itemset)):
sequenceClone = copy.deepcopy(sequence)
sequenceClone[i].pop(j)
result.append(sequenceClone)
return result
"""
Prunes the set of candidates generated for length k given all frequent sequence of level (k-1), as done in AprioriAll
"""
def pruneCandidates(candidatesLastLevel, candidatesGenerated):
return [cand for cand in candidatesGenerated if all(x in candidatesLastLevel for x in generateDirectSubsequences(cand))]
We apply this to the example dataset
candidatesPruned = pruneCandidates(lastLevelFrequentPatterns, newCandidates)
candidatesPruned
[[['a'], ['b'], ['c']], [['a'], ['b', 'c']], [['a'], ['c'], ['b']], [['a'], ['c'], ['c']], [['a', 'b'], ['c']], [['b'], ['c'], ['c']], [['b', 'c'], ['c']], [['c'], ['b'], ['c']], [['c'], ['b', 'c']], [['c'], ['c'], ['b']], [['c'], ['c'], ['c']]]
minSupport = 2
candidatesCounts = [(i, countSupport(dataset, i)) for i in candidatesPruned]
resultLvl = [(i, count) for (i, count) in candidatesCounts if (count >= minSupport)]
resultLvl
[([['a'], ['b'], ['c']], 3), ([['a'], ['b', 'c']], 2), ([['a'], ['c'], ['b']], 3), ([['a'], ['c'], ['c']], 4), ([['a', 'b'], ['c']], 2), ([['b'], ['c'], ['c']], 2), ([['c'], ['b'], ['c']], 2)]
"""
The AprioriAll algorithm. Computes the frequent sequences in a seqeunce dataset for a given minSupport
Args:
dataset: A list of sequences, for which the frequent (sub-)sequences are computed
minSupport: The minimum support that makes a sequence frequent
verbose: If true, additional information on the mining process is printed (i.e., candidates on each level)
Returns:
A list of tuples (s, c), where s is a frequent sequence, and c is the count for that sequence
"""
def apriori(dataset, minSupport, verbose=False):
global numberOfCountingOperations
numberOfCountingOperations = 0
Overall = []
itemsInDataset = sorted(set ([item for sublist1 in dataset for sublist2 in sublist1 for item in sublist2]))
singleItemSequences = [[[item]] for item in itemsInDataset]
singleItemCounts = [(i, countSupport(dataset, i)) for i in singleItemSequences if countSupport(dataset, i) >= minSupport]
Overall.append(singleItemCounts)
print "Result, lvl 1: " + str(Overall[0])
k = 1
while (True):
if not Overall [k - 1]:
break
# 1. Candidate generation
candidatesLastLevel = [x[0] for x in Overall[k - 1]]
candidatesGenerated = generateCandidates (candidatesLastLevel)
# 2. Candidate pruning (using a "containsall" subsequences)
candidatesPruned = [cand for cand in candidatesGenerated if all(x in candidatesLastLevel for x in generateDirectSubsequences(cand))]
# 3. Candidate checking
candidatesCounts = [(i, countSupport(dataset, i)) for i in candidatesPruned]
resultLvl = [(i, count) for (i, count) in candidatesCounts if (count >= minSupport)]
if verbose:
print "Candidates generated, lvl " + str(k + 1) + ": " + str(candidatesGenerated)
print "Candidates pruned, lvl " + str(k + 1) + ": " + str(candidatesPruned)
print "Result, lvl " + str(k + 1) + ": " + str(resultLvl)
Overall.append(resultLvl)
k = k + 1
# "flatten" Overall
Overall = Overall [:-1]
Overall = [item for sublist in Overall for item in sublist]
return Overall
apriori(dataset, 2, verbose=False)
Result, lvl 1: [([['a']], 4), ([['b']], 4), ([['c']], 4)]
[([['a']], 4), ([['b']], 4), ([['c']], 4), ([['a', 'b']], 2), ([['b', 'c']], 2), ([['a'], ['b']], 4), ([['a'], ['c']], 4), ([['b'], ['c']], 3), ([['c'], ['b']], 3), ([['c'], ['c']], 4), ([['a'], ['b'], ['c']], 3), ([['a'], ['b', 'c']], 2), ([['a'], ['c'], ['b']], 3), ([['a'], ['c'], ['c']], 4), ([['a', 'b'], ['c']], 2), ([['b'], ['c'], ['c']], 2), ([['c'], ['b'], ['c']], 2), ([['a'], ['c'], ['b'], ['c']], 2), ([['a', 'b'], ['c'], ['c']], 2)]
"""
Projects a sequence according to a given prefix, as done in PrefixSpan
Args:
sequence: the sequence the projection is built from
prefix: the prefix that is searched for in the sequence
newEvent: if set to True, the first itemset is ignored
Returns:
If the sequence does not contain the prefix, then None.
Otherwise, a new sequence starting from the position of the prefix, including the itemset that includes the prefix
"""
def projectSequence(sequence, prefix, newEvent):
result = None
for i, itemset in enumerate(sequence):
if result is None:
if (not newEvent) or i > 0:
if (all(x in itemset for x in prefix)):
result = [list(itemset)]
else:
result.append(copy.copy(itemset))
return result
seq = [["a"], ["b", "c"], ["a", "c"], ["c"]]
projectSequence(seq, ["b"], False)
[['b', 'c'], ['a', 'c'], ['c']]
projectSequence(seq, ["a", "c"], False)
[['a', 'c'], ['c']]
projectSequence(seq, ["a"], False)
[['a'], ['b', 'c'], ['a', 'c'], ['c']]
projectSequence(seq, ["a"], True)
[['a', 'c'], ['c']]
"""
Projects a dataset according to a given prefix, as done in PrefixSpan
Args:
dataset: the dataset the projection is built from
prefix: the prefix that is searched for in the sequence
newEvent: if set to True, the first itemset is ignored
Returns:
A (potentially empty) list of sequences
"""
def projectDatabase(dataset, prefix, newEvent):
projectedDB = []
for sequence in dataset:
seqProjected = projectSequence(sequence, prefix, newEvent)
if not seqProjected is None:
projectedDB.append(seqProjected)
return projectedDB
datasetProject = [
[["a"], ["a", "b", "c"], ["a", "c"], ["d"], ["c", "f"]],
[["a", "d"], ["c"], ["b", "c"], ["a", "e"]],
[["e", "f"], ["a", "b"], ["d", "f"], ["d"], ["b"]],
[["e"], ["g"], ["a", "f"], ["c"], ["b"], ["c"]]
]
projectDatabase(datasetProject, ["c"], False)
[[['a', 'b', 'c'], ['a', 'c'], ['d'], ['c', 'f']], [['c'], ['b', 'c'], ['a', 'e']], [['c'], ['b'], ['c']]]
"""
Generates a list of all items that are contained in a dataset
"""
def generateItems(dataset):
return sorted(set ([item for sublist1 in dataset for sublist2 in sublist1 for item in sublist2]))
"""
Computes a defaultdict that maps each item in the dataset to its support
"""
def generateItemSupports(dataset, ignoreFirstEvent=False, prefix=[]):
result = defaultdict(int)
for sequence in dataset:
if ignoreFirstEvent:
sequence = sequence[1:]
cooccurringItems = set()
for itemset in sequence:
if all(x in itemset for x in prefix):
for item in itemset:
if not item in prefix:
cooccurringItems.add(item)
for item in cooccurringItems:
result [item] += 1
return sorted(result.items())
"""
The PrefixSpan algorithm. Computes the frequent sequences in a seqeunce dataset for a given minSupport
Args:
dataset: A list of sequences, for which the frequent (sub-)sequences are computed
minSupport: The minimum support that makes a sequence frequent
Returns:
A list of tuples (s, c), where s is a frequent sequence, and c is the count for that sequence
"""
def prefixSpan(dataset, minSupport):
result = []
itemCounts = generateItemSupports(dataset)
for item, count in itemCounts:
if count >= minSupport:
newPrefix = [[item]]
result.append((newPrefix, count))
result.extend(prefixSpanInternal(projectDatabase(dataset, [item], False), minSupport, newPrefix))
return result
def prefixSpanInternal(dataset, minSupport, prevPrefixes=[]):
    """Recursive part of PrefixSpan: extend `prevPrefixes` within the
    projected `dataset`. (The default [] is never mutated -- extensions are
    built on deep copies.)
    """
    patterns = []
    # Extension 1: add an item to the last event (same timestamp). Items are
    # added in increasing order (item > last item) to avoid duplicate events.
    lastEvent = prevPrefixes[-1]
    for item, support in generateItemSupports(dataset, False, prefix=lastEvent):
        if support >= minSupport and item > lastEvent[-1]:
            extended = copy.deepcopy(prevPrefixes)
            extended[-1].append(item)
            patterns.append((extended, support))
            patterns.extend(prefixSpanInternal(
                projectDatabase(dataset, extended[-1], False), minSupport, extended))
    # Extension 2: append a brand-new event containing just the item.
    for item, support in generateItemSupports(dataset, True):
        if support >= minSupport:
            extended = copy.deepcopy(prevPrefixes)
            extended.append([item])
            patterns.append((extended, support))
            patterns.extend(prefixSpanInternal(
                projectDatabase(dataset, [item], True), minSupport, extended))
    return patterns
prefixSpan(dataset, 2)
[([['a']], 4), ([['a', 'b']], 2), ([['a', 'b'], ['c']], 2), ([['a', 'b'], ['c'], ['c']], 2), ([['a'], ['b']], 4), ([['a'], ['b', 'c']], 2), ([['a'], ['b'], ['c']], 3), ([['a'], ['c']], 4), ([['a'], ['c'], ['b']], 3), ([['a'], ['c'], ['b'], ['c']], 2), ([['a'], ['c'], ['c']], 4), ([['b']], 4), ([['b', 'c']], 2), ([['b'], ['c']], 3), ([['b'], ['c'], ['c']], 2), ([['c']], 4), ([['c'], ['b']], 3), ([['c'], ['b'], ['c']], 2), ([['c'], ['c']], 4)]
"""
Given a list of all frequent sequences and their counts, compute the set of closed frequent sequence (as a list)
This is only a very simplistic (naive) implementation for demonstration purposes!
"""
def filterClosed(result):
for supersequence, countSeq in copy.deepcopy(result):
for subsequence, countSubSeq in copy.deepcopy(result):
if isSubsequence(supersequence, subsequence) and (countSeq == countSubSeq) and subsequence != supersequence:
result.remove((subsequence, countSubSeq))
result = prefixSpan(dataset, 2)
filterClosed(result)
result
[([['a', 'b'], ['c'], ['c']], 2), ([['a'], ['b']], 4), ([['a'], ['b', 'c']], 2), ([['a'], ['b'], ['c']], 3), ([['a'], ['c'], ['b']], 3), ([['a'], ['c'], ['b'], ['c']], 2), ([['a'], ['c'], ['c']], 4)]
"""
Given a list of all frequent sequences and their counts, compute the set of maximal frequent sequence (as a list)
This is only a very naive implementation for demonstration purposes!
"""
def filterMaximal(result):
for supersequence, countSeq in copy.deepcopy(result):
for subsequence, countSubSeq in copy.deepcopy(result):
if isSubsequence (supersequence, subsequence) and subsequence != supersequence:
result.remove((subsequence, countSubSeq))
result = prefixSpan (dataset, 2)
filterMaximal(result)
result
[([['a', 'b'], ['c'], ['c']], 2), ([['a'], ['b', 'c']], 2), ([['a'], ['c'], ['b'], ['c']], 2)]
Now we try to find some sequential patterns in a real world dataset: Wikispeedia
First load the dataset.
import csv
sequences = []
# from https://snap.stanford.edu/data/wikispeedia.html
for line in csv.reader((row for row in open("data/paths_finished.tsv") if not row.startswith('#')), delimiter='\t'):
    if len(line) == 0:
        continue
    # the path is the 4th column: a ';'-separated list of visited articles
    seq = line[3].split(";")
    # for simplicity, let us remove back clicks
    seq = [x for x in seq if x != "<"]
    sequences.append(seq)
Convert this to the list of lists of lists that we use as our data format
# Wrap every page visit in its own singleton event to obtain the
# list-of-lists-of-lists data format used above.
wikispeediaData = [[[item] for item in seq] for seq in sequences]
%time prefixSpan (wikispeediaData, 1000)
Wall time: 46.3 s
[([['Africa']], 2738), ([['Agriculture']], 1147), ([['Animal']], 1666), ([['Asia']], 1167), ([['Asteroid']], 1171), ([['Asteroid'], ['Viking']], 1043), ([['Atlantic_Ocean']], 1297), ([['Bird']], 1182), ([['Brain']], 1316), ([['Brain'], ['Telephone']], 1043), ([['China']], 1110), ([['Christianity']], 1074), ([['Computer']], 1528), ([['Earth']], 3176), ([['England']], 3261), ([['English_language']], 1414), ([['Europe']], 4303), ([['France']], 1588), ([['Germany']], 1738), ([['Human']], 1604), ([['India']], 1216), ([['Internet']], 1023), ([['Japan']], 1070), ([['Mammal']], 1568), ([['North_America']], 1861), ([['Periodic_table']], 1394), ([['Plant']], 1127), ([['Russia']], 1007), ([['Science']], 1479), ([['Telephone']], 1251), ([['Theatre']], 1034), ([['United_Kingdom']], 3807), ([['United_Nations']], 1050), ([['United_States']], 8675), ([['Viking']], 1192), ([['World_War_II']], 2267), ([['Zebra']], 1041)]
This will take a long time:
# %time apriori(wikispeediaData, 1000)