# Preliminaries¶

In [1]:
import copy
import itertools
from collections import defaultdict
from operator import itemgetter


#### Our dataset format¶

An event is a list of strings.

A sequence is a list of events.

A dataset is a list of sequences.

Thus, a dataset is a list of lists of lists of strings.

In [2]:
# A toy dataset: 4 sequences over the items a, b, c, d.
# Each inner list of strings is one event (itemset); each list of events is one sequence.
dataset =  [
[["a"], ["a", "b", "c"], ["a", "c"], ["c"]],
[["a"], ["c"], ["b", "c"]],
[["a", "b"], ["d"], ["c"], ["b"], ["c"]],
[["a"], ["c"], ["b"], ["c"]]
]


# Foundations¶

### Subsequences¶

In [3]:
"""
This is a simple recursive method that checks if subsequence is a subSequence of mainSequence
"""
def isSubsequence(mainSequence, subSequence):
subSequenceClone = list(subSequence) # clone the sequence, because we will alter it
return isSubsequenceRecursive(mainSequence, subSequenceClone) #start recursion

"""
Function for the recursive call of isSubsequence, not intended for external calls
"""
def isSubsequenceRecursive(mainSequence, subSequenceClone, start=0):
# Check if empty: End of recursion, all itemsets have been found
if (not subSequenceClone):
return True
# retrieves element of the subsequence and removes is from subsequence
firstElem = set(subSequenceClone.pop(0))
# Search for the first itemset...
for i in range(start, len(mainSequence)):
if (set(mainSequence[i]).issuperset(firstElem)):
# and recurse
return isSubsequenceRecursive(mainSequence, subSequenceClone, i + 1)
return False

In [4]:
aSequence = [["a"], ["b", "c"], ["d"], ["a", "e"]]

In [5]:
isSubsequence(aSequence, [["a"], ["d"], ["e"]])

Out[5]:
True
In [6]:
isSubsequence(aSequence, [["a"], ["b", "c"], ["e"]])

Out[6]:
True
In [7]:
isSubsequence(aSequence, [["a"], ["b", "d"]])

Out[7]:
False

### Length of an itemset¶

In [8]:
"""
Computes the length of the sequence (sum of the length of the contained itemsets)
"""
def sequenceLength(sequence):
return sum(len(i) for i in sequence)

In [9]:
print sequenceLength ([["a"], ["b", "c"], ["a"], ["b","c","d"]])

7


### Support of a sequence¶

In [10]:
"""
Computes the support of a sequence in a dataset
"""
def countSupport (dataset, candidateSequence):
return sum(1 for seq in dataset if isSubsequence(seq, candidateSequence))

In [11]:
dataset

Out[11]:
[[['a'], ['a', 'b', 'c'], ['a', 'c'], ['c']],
[['a'], ['c'], ['b', 'c']],
[['a', 'b'], ['d'], ['c'], ['b'], ['c']],
[['a'], ['c'], ['b'], ['c']]]
In [12]:
countSupport(dataset, [["b"]])

Out[12]:
4
In [13]:
countSupport(dataset, [["a"], ["b", "c"]])

Out[13]:
2

# AprioriAll¶

### 1 . Candidate Generation¶

#### For a single pair:¶

In [14]:
"""
Generates one candidate of length k from two candidates of length (k-1) as used in the AprioriAll algorithm
"""
def generateCandidatesForPair(cand1, cand2):
cand1Clone = copy.deepcopy(cand1)
cand2Clone = copy.deepcopy(cand2)
# drop the leftmost item from cand1:
if (len (cand1[0]) == 1):
cand1Clone.pop(0)
else:
cand1Clone[0] = cand1Clone[0][1:]
# drop the rightmost item from cand2:
if (len (cand2[-1]) == 1):
cand2Clone.pop(-1)
else:
cand2Clone[-1] = cand2Clone[-1][:-1]

# if the result is not the same, then we dont need to join
if not cand1Clone == cand2Clone:
return []
else:
newCandidate = copy.deepcopy(cand1)
if (len (cand2[-1]) == 1):
newCandidate.append(cand2[-1])
else:
newCandidate [-1].extend(cand2[-1][-1])
return newCandidate

In [15]:
candidateA = [["a"], ["b", "c"], ["d"]]
candidateB = [["b", "c"], ["d", "e"]]
generateCandidatesForPair(candidateA, candidateB)

Out[15]:
[['a'], ['b', 'c'], ['d', 'e']]
In [16]:
candidateA = [["a"], ["b", "c"], ["d"]]
candidateC = [["b", "c"], ["d"], ["e"]]
generateCandidatesForPair(candidateA, candidateC)

Out[16]:
[['a'], ['b', 'c'], ['d'], ['e']]
In [17]:
candidateA = [["a"], ["b", "c"], ["d"]]
candidateD = [["a"], ["b", "c"], ["e"]]
generateCandidatesForPair(candidateA, candidateD)

Out[17]:
[]

#### For a set of candidates (of the last level):¶

In [18]:
"""
Generates the set of candidates of length k from the set of frequent sequences with length (k-1)
"""
def generateCandidates(lastLevelCandidates):
k = sequenceLength(lastLevelCandidates[0]) + 1
if (k == 2):
flatShortCandidates = [item for sublist2 in lastLevelCandidates for sublist1 in sublist2 for item in sublist1]
result = [[[a, b]] for a in flatShortCandidates for b in flatShortCandidates if b > a]
result.extend([[[a], [b]] for a in flatShortCandidates for b in flatShortCandidates])
return result
else:
candidates = []
for i in range(0, len(lastLevelCandidates)):
for j in range(0, len(lastLevelCandidates)):
newCand = generateCandidatesForPair(lastLevelCandidates[i], lastLevelCandidates[j])
if (not newCand == []):
candidates.append(newCand)
candidates.sort()
return candidates


An example: let's assume we know the frequent sequences of level 2:

In [19]:
# The (assumed known) frequent sequences of level 2, used as input for the
# level-3 candidate generation below.
lastLevelFrequentPatterns = [
[['a', 'b']],
[['b', 'c']],
[['a'], ['b']],
[['a'], ['c']],
[['b'], ['c']],
[['c'], ['b']],
[['c'], ['c']],
]


Then we can generate the candidates for level 3:

In [20]:
newCandidates = generateCandidates(lastLevelFrequentPatterns)
newCandidates

Out[20]:
[[['a'], ['b'], ['c']],
[['a'], ['b', 'c']],
[['a'], ['c'], ['b']],
[['a'], ['c'], ['c']],
[['a', 'b'], ['c']],
[['a', 'b', 'c']],
[['b'], ['c'], ['b']],
[['b'], ['c'], ['c']],
[['b', 'c'], ['b']],
[['b', 'c'], ['c']],
[['c'], ['b'], ['c']],
[['c'], ['b', 'c']],
[['c'], ['c'], ['b']],
[['c'], ['c'], ['c']]]

### 2 . Candidate Checking¶

In [21]:
"""
Computes all direct subsequence for a given sequence.
A direct subsequence is any sequence that originates from deleting exactly one item from any event in the original sequence.
"""
def generateDirectSubsequences(sequence):
result = []
for i, itemset in enumerate(sequence):
if (len(itemset) == 1):
sequenceClone = copy.deepcopy(sequence)
sequenceClone.pop(i)
result.append(sequenceClone)
else:
for j in range(len(itemset)):
sequenceClone = copy.deepcopy(sequence)
sequenceClone[i].pop(j)
result.append(sequenceClone)
return result

In [22]:
"""
Prunes the set of candidates generated for length k given all frequent sequence of level (k-1), as done in AprioriAll
"""
def pruneCandidates(candidatesLastLevel, candidatesGenerated):
return [cand for cand in candidatesGenerated if all(x in candidatesLastLevel for x in generateDirectSubsequences(cand))]


We apply this to the example dataset:

In [23]:
candidatesPruned = pruneCandidates(lastLevelFrequentPatterns, newCandidates)
candidatesPruned

Out[23]:
[[['a'], ['b'], ['c']],
[['a'], ['b', 'c']],
[['a'], ['c'], ['b']],
[['a'], ['c'], ['c']],
[['a', 'b'], ['c']],
[['b'], ['c'], ['c']],
[['b', 'c'], ['c']],
[['c'], ['b'], ['c']],
[['c'], ['b', 'c']],
[['c'], ['c'], ['b']],
[['c'], ['c'], ['c']]]

### 3. Count Candidates (and filter not frequent ones):¶

In [24]:
# Count the support of every pruned candidate and keep only the frequent ones
# (uses `dataset` and `candidatesPruned` from the cells above).
minSupport = 2
candidatesCounts = [(i, countSupport(dataset, i)) for i in candidatesPruned]
resultLvl = [(i, count) for (i, count) in candidatesCounts if (count >= minSupport)]
resultLvl

Out[24]:
[([['a'], ['b'], ['c']], 3),
([['a'], ['b', 'c']], 2),
([['a'], ['c'], ['b']], 3),
([['a'], ['c'], ['c']], 4),
([['a', 'b'], ['c']], 2),
([['b'], ['c'], ['c']], 2),
([['c'], ['b'], ['c']], 2)]

### Put it all together:¶

In [25]:
"""
The AprioriAll algorithm. Computes the frequent sequences in a seqeunce dataset for a given minSupport

Args:
dataset: A list of sequences, for which the frequent (sub-)sequences are computed
minSupport: The minimum support that makes a sequence frequent
verbose: If true, additional information on the mining process is printed (i.e., candidates on each level)
Returns:
A list of tuples (s, c), where s is a frequent sequence, and c is the count for that sequence
"""
def apriori(dataset, minSupport, verbose=False):
global numberOfCountingOperations
numberOfCountingOperations = 0
Overall = []
itemsInDataset = sorted(set ([item for sublist1 in dataset for sublist2 in sublist1 for item in sublist2]))
singleItemSequences = [[[item]] for item in itemsInDataset]
singleItemCounts = [(i, countSupport(dataset, i)) for i in singleItemSequences if countSupport(dataset, i) >= minSupport]
Overall.append(singleItemCounts)
print "Result, lvl 1: " + str(Overall[0])
k = 1
while (True):
if not Overall [k - 1]:
break
# 1. Candidate generation
candidatesLastLevel = [x[0] for x in Overall[k - 1]]
candidatesGenerated = generateCandidates (candidatesLastLevel)
# 2. Candidate pruning (using a "containsall" subsequences)
candidatesPruned = [cand for cand in candidatesGenerated if all(x in candidatesLastLevel for x in generateDirectSubsequences(cand))]
# 3. Candidate checking
candidatesCounts = [(i, countSupport(dataset, i)) for i in candidatesPruned]
resultLvl = [(i, count) for (i, count) in candidatesCounts if (count >= minSupport)]
if verbose:
print "Candidates generated, lvl " + str(k + 1) + ": " + str(candidatesGenerated)
print "Candidates pruned, lvl " + str(k + 1) + ": " + str(candidatesPruned)
print "Result, lvl " + str(k + 1) + ": " + str(resultLvl)
Overall.append(resultLvl)
k = k + 1
# "flatten" Overall
Overall = Overall [:-1]
Overall = [item for sublist in Overall for item in sublist]
return Overall

In [26]:
apriori(dataset, 2, verbose=False)

Result, lvl 1: [([['a']], 4), ([['b']], 4), ([['c']], 4)]

Out[26]:
[([['a']], 4),
([['b']], 4),
([['c']], 4),
([['a', 'b']], 2),
([['b', 'c']], 2),
([['a'], ['b']], 4),
([['a'], ['c']], 4),
([['b'], ['c']], 3),
([['c'], ['b']], 3),
([['c'], ['c']], 4),
([['a'], ['b'], ['c']], 3),
([['a'], ['b', 'c']], 2),
([['a'], ['c'], ['b']], 3),
([['a'], ['c'], ['c']], 4),
([['a', 'b'], ['c']], 2),
([['b'], ['c'], ['c']], 2),
([['c'], ['b'], ['c']], 2),
([['a'], ['c'], ['b'], ['c']], 2),
([['a', 'b'], ['c'], ['c']], 2)]

# PrefixSpan¶

### Project a sequence¶

In [27]:
"""
Projects a sequence according to a given prefix, as done in PrefixSpan

Args:
sequence: the sequence the projection is built from
prefix: the prefix that is searched for in the sequence
newEvent: if set to True, the first itemset is ignored
Returns:
If the sequence does not contain the prefix, then None.
Otherwise, a new sequence starting from the position of the prefix, including the itemset that includes the prefix
"""
def projectSequence(sequence, prefix, newEvent):
result = None
for i, itemset in enumerate(sequence):
if result is None:
if (not newEvent) or i > 0:
if (all(x in itemset for x in prefix)):
result = [list(itemset)]
else:
result.append(copy.copy(itemset))
return result

In [28]:
seq = [["a"], ["b", "c"], ["a", "c"], ["c"]]
projectSequence(seq, ["b"], False)

Out[28]:
[['b', 'c'], ['a', 'c'], ['c']]
In [29]:
projectSequence(seq, ["a", "c"], False)

Out[29]:
[['a', 'c'], ['c']]
In [30]:
projectSequence(seq, ["a"], False)

Out[30]:
[['a'], ['b', 'c'], ['a', 'c'], ['c']]
In [31]:
projectSequence(seq, ["a"], True)

Out[31]:
[['a', 'c'], ['c']]

### Project a dataset¶

In [32]:
"""
Projects a dataset according to a given prefix, as done in PrefixSpan

Args:
dataset: the dataset the projection is built from
prefix: the prefix that is searched for in the sequence
newEvent: if set to True, the first itemset is ignored
Returns:
A (potentially empty) list of sequences
"""
def projectDatabase(dataset, prefix, newEvent):
projectedDB = []
for sequence in dataset:
seqProjected = projectSequence(sequence, prefix, newEvent)
if not seqProjected is None:
projectedDB.append(seqProjected)
return projectedDB

In [33]:
# A second example dataset, used below to demonstrate database projection.
datasetProject = [
[["a"], ["a", "b", "c"], ["a", "c"], ["d"], ["c", "f"]],
[["a", "d"], ["c"], ["b", "c"], ["a", "e"]],
[["e", "f"], ["a", "b"], ["d", "f"], ["d"], ["b"]],
[["e"], ["g"], ["a", "f"], ["c"], ["b"], ["c"]]
]

In [34]:
projectDatabase(datasetProject, ["c"], False)

Out[34]:
[[['a', 'b', 'c'], ['a', 'c'], ['d'], ['c', 'f']],
[['c'], ['b', 'c'], ['a', 'e']],
[['c'], ['b'], ['c']]]

### The main algorithm¶

#### Some more utility functions:¶

In [35]:
"""
Generates a list of all items that are contained in a dataset
"""
def generateItems(dataset):
return sorted(set ([item for sublist1 in dataset for sublist2 in sublist1 for item in sublist2]))

"""
Computes a defaultdict that maps each item in the dataset to its support
"""
def generateItemSupports(dataset, ignoreFirstEvent=False, prefix=[]):
result = defaultdict(int)
for sequence in dataset:
if ignoreFirstEvent:
sequence = sequence[1:]
cooccurringItems = set()
for itemset in sequence:
if all(x in itemset for x in prefix):
for item in itemset:
if not item in prefix:
for item in cooccurringItems:
result [item] += 1
return sorted(result.items())


#### Finally, the algorithm:¶

In [36]:
"""
The PrefixSpan algorithm. Computes the frequent sequences in a seqeunce dataset for a given minSupport

Args:
dataset: A list of sequences, for which the frequent (sub-)sequences are computed
minSupport: The minimum support that makes a sequence frequent
Returns:
A list of tuples (s, c), where s is a frequent sequence, and c is the count for that sequence
"""
def prefixSpan(dataset, minSupport):
result = []
itemCounts = generateItemSupports(dataset)
for item, count in itemCounts:
if count >= minSupport:
newPrefix = [[item]]
result.append((newPrefix, count))
result.extend(prefixSpanInternal(projectDatabase(dataset, [item], False), minSupport, newPrefix))
return result

def prefixSpanInternal(dataset, minSupport, prevPrefixes=[]):
result = []

# Add a new item to the last element (==same time)
itemCountSameEvent = generateItemSupports(dataset, False, prefix=prevPrefixes[-1])
for item, count in itemCountSameEvent:
if (count >= minSupport) and item > prevPrefixes[-1][-1]:
newPrefix = copy.deepcopy(prevPrefixes)
newPrefix[-1].append(item)
result.append((newPrefix, count))
result.extend(prefixSpanInternal(projectDatabase(dataset, newPrefix[-1], False), minSupport, newPrefix))

# Add a new event to the prefix
itemCountSubsequentEvents = generateItemSupports(dataset, True)
for item, count in itemCountSubsequentEvents:
if count >= minSupport:
newPrefix = copy.deepcopy(prevPrefixes)
newPrefix.append([item])
result.append((newPrefix, count))
result.extend(prefixSpanInternal(projectDatabase(dataset, [item], True), minSupport, newPrefix))
return result

In [37]:
prefixSpan(dataset, 2)

Out[37]:
[([['a']], 4),
([['a', 'b']], 2),
([['a', 'b'], ['c']], 2),
([['a', 'b'], ['c'], ['c']], 2),
([['a'], ['b']], 4),
([['a'], ['b', 'c']], 2),
([['a'], ['b'], ['c']], 3),
([['a'], ['c']], 4),
([['a'], ['c'], ['b']], 3),
([['a'], ['c'], ['b'], ['c']], 2),
([['a'], ['c'], ['c']], 4),
([['b']], 4),
([['b', 'c']], 2),
([['b'], ['c']], 3),
([['b'], ['c'], ['c']], 2),
([['c']], 4),
([['c'], ['b']], 3),
([['c'], ['b'], ['c']], 2),
([['c'], ['c']], 4)]

# Filter for closed and maximal patterns¶

### Closed patterns¶

In [38]:
"""
Given a list of all frequent sequences and their counts, compute the set of closed frequent sequence (as a list)
This is only a very simplistic (naive) implementation for demonstration purposes!
"""
def filterClosed(result):
for supersequence, countSeq in copy.deepcopy(result):
for subsequence, countSubSeq in copy.deepcopy(result):
if isSubsequence(supersequence, subsequence) and (countSeq == countSubSeq) and subsequence != supersequence:
result.remove((subsequence, countSubSeq))

In [39]:
result = prefixSpan(dataset, 2)
filterClosed(result)
result

Out[39]:
[([['a', 'b'], ['c'], ['c']], 2),
([['a'], ['b']], 4),
([['a'], ['b', 'c']], 2),
([['a'], ['b'], ['c']], 3),
([['a'], ['c'], ['b']], 3),
([['a'], ['c'], ['b'], ['c']], 2),
([['a'], ['c'], ['c']], 4)]

### Maximal sequences¶

In [40]:
"""
Given a list of all frequent sequences and their counts, compute the set of maximal frequent sequence (as a list)
This is only a very naive implementation for demonstration purposes!
"""
def filterMaximal(result):
for supersequence, countSeq in copy.deepcopy(result):
for subsequence, countSubSeq in copy.deepcopy(result):
if isSubsequence (supersequence, subsequence) and subsequence != supersequence:
result.remove((subsequence, countSubSeq))

In [41]:
result = prefixSpan (dataset, 2)
filterMaximal(result)
result

Out[41]:
[([['a', 'b'], ['c'], ['c']], 2),
([['a'], ['b', 'c']], 2),
([['a'], ['c'], ['b'], ['c']], 2)]

# Application example: Wikispeedia¶

Now we try to find some sequential patterns in a real world dataset: Wikispeedia

In [42]:
import csv

sequences = []

# from https://snap.stanford.edu/data/wikispeedia.html
# The click path is the ";"-separated string in column 3; "<" marks a back click.
# Use a context manager so the file handle is closed (the original leaked it).
with open("data/paths_finished.tsv") as tsvFile:
    dataRows = (row for row in tsvFile if not row.startswith('#'))
    for line in csv.reader(dataRows, delimiter='\t'):
        if len(line) == 0:
            continue
        seq = line[3].split(";")
        # for simplicity, let us remove back clicks
        seq = [x for x in seq if x != "<"]
        sequences.append(seq)


Convert this to the list of lists of lists that we use as our data format:

In [43]:
# Wrap every visited page into a one-item event, so each click path becomes
# a list-of-lists sequence matching our dataset format.
wikispeediaData = []
for seq in sequences:
    newSeq = []
    for item in seq:
        newSeq.append([item])
    # BUGFIX: the converted sequence was built but never stored, leaving
    # wikispeediaData empty for the mining cell below.
    wikispeediaData.append(newSeq)

In [44]:
%time prefixSpan (wikispeediaData, 1000)

Wall time: 46.3 s

Out[44]:
[([['Africa']], 2738),
([['Agriculture']], 1147),
([['Animal']], 1666),
([['Asia']], 1167),
([['Asteroid']], 1171),
([['Asteroid'], ['Viking']], 1043),
([['Atlantic_Ocean']], 1297),
([['Bird']], 1182),
([['Brain']], 1316),
([['Brain'], ['Telephone']], 1043),
([['China']], 1110),
([['Christianity']], 1074),
([['Computer']], 1528),
([['Earth']], 3176),
([['England']], 3261),
([['English_language']], 1414),
([['Europe']], 4303),
([['France']], 1588),
([['Germany']], 1738),
([['Human']], 1604),
([['India']], 1216),
([['Internet']], 1023),
([['Japan']], 1070),
([['Mammal']], 1568),
([['North_America']], 1861),
([['Periodic_table']], 1394),
([['Plant']], 1127),
([['Russia']], 1007),
([['Science']], 1479),
([['Telephone']], 1251),
([['Theatre']], 1034),
([['United_Kingdom']], 3807),
([['United_Nations']], 1050),
([['United_States']], 8675),
([['Viking']], 1192),
([['World_War_II']], 2267),
([['Zebra']], 1041)]

This will take a long time:

In [ ]:
# %time apriori(wikispeediaData, 1000)