Preliminaries

In [1]:
import copy
import itertools
from collections import defaultdict
from operator import itemgetter

Our dataset format

An event is a list of strings.

A sequence is a list of events.

A dataset is a list of sequences.

Thus, a dataset is a list of lists of lists of strings.

In [2]:
dataset =  [
    [["a"], ["a", "b", "c"], ["a", "c"], ["c"]],
    [["a"], ["c"], ["b", "c"]],
    [["a", "b"], ["d"], ["c"], ["b"], ["c"]],
    [["a"], ["c"], ["b"], ["c"]]
]

Foundations

Subsequences

In [3]:
"""
This is a simple recursive method that checks if subsequence is a subSequence of mainSequence
"""
def isSubsequence(mainSequence, subSequence):
    subSequenceClone = list(subSequence) # clone the sequence, because we will alter it
    return isSubsequenceRecursive(mainSequence, subSequenceClone) #start recursion

"""
Function for the recursive call of isSubsequence, not intended for external calls
"""
def isSubsequenceRecursive(mainSequence, subSequenceClone, start=0):
    # Check if empty: End of recursion, all itemsets have been found
    if (not subSequenceClone):
        return True
    # retrieves element of the subsequence and removes is from subsequence 
    firstElem = set(subSequenceClone.pop(0))
    # Search for the first itemset...
    for i in range(start, len(mainSequence)):
        if (set(mainSequence[i]).issuperset(firstElem)):
            # and recurse
            return isSubsequenceRecursive(mainSequence, subSequenceClone, i + 1)
    return False
In [4]:
aSequence = [["a"], ["b", "c"], ["d"], ["a", "e"]]
In [5]:
isSubsequence(aSequence, [["a"], ["d"], ["e"]])
Out[5]:
True
In [6]:
isSubsequence(aSequence, [["a"], ["b", "c"], ["e"]])
Out[6]:
True
In [7]:
isSubsequence(aSequence, [["a"], ["b", "d"]])
Out[7]:
False

Length of an itemset

In [8]:
"""
Computes the length of the sequence (sum of the length of the contained itemsets)
"""
def sequenceLength(sequence):
    return sum(len(i) for i in sequence)
In [9]:
print sequenceLength ([["a"], ["b", "c"], ["a"], ["b","c","d"]])
7

Support of a sequence

In [10]:
"""
Computes the support of a sequence in a dataset
"""
def countSupport (dataset, candidateSequence):
    return sum(1 for seq in dataset if isSubsequence(seq, candidateSequence)) 
In [11]:
dataset
Out[11]:
[[['a'], ['a', 'b', 'c'], ['a', 'c'], ['c']],
 [['a'], ['c'], ['b', 'c']],
 [['a', 'b'], ['d'], ['c'], ['b'], ['c']],
 [['a'], ['c'], ['b'], ['c']]]
In [12]:
countSupport(dataset, [["b"]])
Out[12]:
4
In [13]:
countSupport(dataset, [["a"], ["b", "c"]])
Out[13]:
2

AprioriAll

1 . Candidate Generation

For a single pair:

In [14]:
"""
Generates one candidate of length k from two candidates of length (k-1) as used in the AprioriAll algorithm
"""
def generateCandidatesForPair(cand1, cand2):
    cand1Clone = copy.deepcopy(cand1)
    cand2Clone = copy.deepcopy(cand2)
    # drop the leftmost item from cand1:
    if (len (cand1[0]) == 1):
        cand1Clone.pop(0)
    else:
        cand1Clone[0] = cand1Clone[0][1:]
    # drop the rightmost item from cand2:
    if (len (cand2[-1]) == 1):
        cand2Clone.pop(-1)
    else:
        cand2Clone[-1] = cand2Clone[-1][:-1]
    
    # if the result is not the same, then we dont need to join
    if not cand1Clone == cand2Clone:
        return []
    else:
        newCandidate = copy.deepcopy(cand1)
        if (len (cand2[-1]) == 1):
            newCandidate.append(cand2[-1])
        else:
            newCandidate [-1].extend(cand2[-1][-1])
        return newCandidate
In [15]:
candidateA = [["a"], ["b", "c"], ["d"]]
candidateB = [["b", "c"], ["d", "e"]]
generateCandidatesForPair(candidateA, candidateB)
Out[15]:
[['a'], ['b', 'c'], ['d', 'e']]
In [16]:
candidateA = [["a"], ["b", "c"], ["d"]]
candidateC = [["b", "c"], ["d"], ["e"]]
generateCandidatesForPair(candidateA, candidateC)
Out[16]:
[['a'], ['b', 'c'], ['d'], ['e']]
In [17]:
candidateA = [["a"], ["b", "c"], ["d"]]
candidateD = [["a"], ["b", "c"], ["e"]]
generateCandidatesForPair(candidateA, candidateD)
Out[17]:
[]

For a set of candidates (of the last level):

In [18]:
"""
Generates the set of candidates of length k from the set of frequent sequences with length (k-1)
"""
def generateCandidates(lastLevelCandidates):
    k = sequenceLength(lastLevelCandidates[0]) + 1
    if (k == 2):
        flatShortCandidates = [item for sublist2 in lastLevelCandidates for sublist1 in sublist2 for item in sublist1]
        result = [[[a, b]] for a in flatShortCandidates for b in flatShortCandidates if b > a]
        result.extend([[[a], [b]] for a in flatShortCandidates for b in flatShortCandidates])
        return result
    else:
        candidates = []
        for i in range(0, len(lastLevelCandidates)):
            for j in range(0, len(lastLevelCandidates)):
                newCand = generateCandidatesForPair(lastLevelCandidates[i], lastLevelCandidates[j])
                if (not newCand == []):
                    candidates.append(newCand)
        candidates.sort()
        return candidates

An example; Lets assume, we know the frequent sequences of level 2:

In [19]:
lastLevelFrequentPatterns = [
    [['a', 'b']], 
    [['b', 'c']], 
    [['a'], ['b']], 
    [['a'], ['c']], 
    [['b'], ['c']],
    [['c'], ['b']], 
    [['c'], ['c']],  
]

Then we can generate the candidates for level 3:

In [20]:
newCandidates = generateCandidates(lastLevelFrequentPatterns)
newCandidates
Out[20]:
[[['a'], ['b'], ['c']],
 [['a'], ['b', 'c']],
 [['a'], ['c'], ['b']],
 [['a'], ['c'], ['c']],
 [['a', 'b'], ['c']],
 [['a', 'b', 'c']],
 [['b'], ['c'], ['b']],
 [['b'], ['c'], ['c']],
 [['b', 'c'], ['b']],
 [['b', 'c'], ['c']],
 [['c'], ['b'], ['c']],
 [['c'], ['b', 'c']],
 [['c'], ['c'], ['b']],
 [['c'], ['c'], ['c']]]

2 . Candidate Checking

In [21]:
"""
Computes all direct subsequence for a given sequence.
A direct subsequence is any sequence that originates from deleting exactly one item from any event in the original sequence.
"""
def generateDirectSubsequences(sequence):
    result = []
    for i, itemset in enumerate(sequence):
        if (len(itemset) == 1):
            sequenceClone = copy.deepcopy(sequence)
            sequenceClone.pop(i)
            result.append(sequenceClone)
        else:
            for j in range(len(itemset)):
                sequenceClone = copy.deepcopy(sequence)
                sequenceClone[i].pop(j)
                result.append(sequenceClone)
    return result
In [22]:
"""
Prunes the set of candidates generated for length k given all frequent sequence of level (k-1), as done in AprioriAll
"""
def pruneCandidates(candidatesLastLevel, candidatesGenerated):
    return [cand for cand in candidatesGenerated if all(x in candidatesLastLevel for x in generateDirectSubsequences(cand))]

We apply this to the example dataset:

In [23]:
candidatesPruned = pruneCandidates(lastLevelFrequentPatterns, newCandidates)
candidatesPruned
Out[23]:
[[['a'], ['b'], ['c']],
 [['a'], ['b', 'c']],
 [['a'], ['c'], ['b']],
 [['a'], ['c'], ['c']],
 [['a', 'b'], ['c']],
 [['b'], ['c'], ['c']],
 [['b', 'c'], ['c']],
 [['c'], ['b'], ['c']],
 [['c'], ['b', 'c']],
 [['c'], ['c'], ['b']],
 [['c'], ['c'], ['c']]]

3. Count Candidates (and filter not frequent ones):

In [24]:
minSupport = 2
candidatesCounts = [(i, countSupport(dataset, i)) for i in candidatesPruned]
resultLvl = [(i, count) for (i, count) in candidatesCounts if (count >= minSupport)]
resultLvl
Out[24]:
[([['a'], ['b'], ['c']], 3),
 ([['a'], ['b', 'c']], 2),
 ([['a'], ['c'], ['b']], 3),
 ([['a'], ['c'], ['c']], 4),
 ([['a', 'b'], ['c']], 2),
 ([['b'], ['c'], ['c']], 2),
 ([['c'], ['b'], ['c']], 2)]

Put it all together:

In [25]:
"""
The AprioriAll algorithm. Computes the frequent sequences in a seqeunce dataset for a given minSupport

Args:
    dataset: A list of sequences, for which the frequent (sub-)sequences are computed
    minSupport: The minimum support that makes a sequence frequent
    verbose: If true, additional information on the mining process is printed (i.e., candidates on each level)
Returns:
    A list of tuples (s, c), where s is a frequent sequence, and c is the count for that sequence
"""
def apriori(dataset, minSupport, verbose=False):
    global numberOfCountingOperations
    numberOfCountingOperations = 0
    Overall = []
    itemsInDataset = sorted(set ([item for sublist1 in dataset for sublist2 in sublist1 for item in sublist2]))
    singleItemSequences = [[[item]] for item in itemsInDataset]
    singleItemCounts = [(i, countSupport(dataset, i)) for i in singleItemSequences if countSupport(dataset, i) >= minSupport]
    Overall.append(singleItemCounts)
    print "Result, lvl 1: " + str(Overall[0])
    k = 1
    while (True):
        if not Overall [k - 1]:
            break
        # 1. Candidate generation
        candidatesLastLevel = [x[0] for x in Overall[k - 1]]
        candidatesGenerated = generateCandidates (candidatesLastLevel)
        # 2. Candidate pruning (using a "containsall" subsequences)
        candidatesPruned = [cand for cand in candidatesGenerated if all(x in candidatesLastLevel for x in generateDirectSubsequences(cand))]
        # 3. Candidate checking
        candidatesCounts = [(i, countSupport(dataset, i)) for i in candidatesPruned]
        resultLvl = [(i, count) for (i, count) in candidatesCounts if (count >= minSupport)]
        if verbose:
            print "Candidates generated, lvl " + str(k + 1) + ": " + str(candidatesGenerated)
            print "Candidates pruned, lvl " + str(k + 1) + ": " + str(candidatesPruned)
            print "Result, lvl " + str(k + 1) + ": " + str(resultLvl)
        Overall.append(resultLvl)
        k = k + 1
    # "flatten" Overall
    Overall = Overall [:-1]
    Overall = [item for sublist in Overall for item in sublist]
    return Overall
In [26]:
apriori(dataset, 2, verbose=False)
Result, lvl 1: [([['a']], 4), ([['b']], 4), ([['c']], 4)]
Out[26]:
[([['a']], 4),
 ([['b']], 4),
 ([['c']], 4),
 ([['a', 'b']], 2),
 ([['b', 'c']], 2),
 ([['a'], ['b']], 4),
 ([['a'], ['c']], 4),
 ([['b'], ['c']], 3),
 ([['c'], ['b']], 3),
 ([['c'], ['c']], 4),
 ([['a'], ['b'], ['c']], 3),
 ([['a'], ['b', 'c']], 2),
 ([['a'], ['c'], ['b']], 3),
 ([['a'], ['c'], ['c']], 4),
 ([['a', 'b'], ['c']], 2),
 ([['b'], ['c'], ['c']], 2),
 ([['c'], ['b'], ['c']], 2),
 ([['a'], ['c'], ['b'], ['c']], 2),
 ([['a', 'b'], ['c'], ['c']], 2)]

PrefixSpan

Project a sequence

In [27]:
"""
Projects a sequence according to a given prefix, as done in PrefixSpan

Args:
    sequence: the sequence the projection is built from
    prefix: the prefix that is searched for in the sequence
    newEvent: if set to True, the first itemset is ignored
Returns:
    If the sequence does not contain the prefix, then None.
    Otherwise, a new sequence starting from the position of the prefix, including the itemset that includes the prefix
"""
def projectSequence(sequence, prefix, newEvent):
    result = None
    for i, itemset in enumerate(sequence):
        if result is None:
            if (not newEvent) or i > 0:
                if (all(x in itemset for x in prefix)):
                    result = [list(itemset)]
        else:
            result.append(copy.copy(itemset))
    return result
In [28]:
seq = [["a"], ["b", "c"], ["a", "c"], ["c"]]
projectSequence(seq, ["b"], False)
Out[28]:
[['b', 'c'], ['a', 'c'], ['c']]
In [29]:
projectSequence(seq, ["a", "c"], False)
Out[29]:
[['a', 'c'], ['c']]
In [30]:
projectSequence(seq, ["a"], False)
Out[30]:
[['a'], ['b', 'c'], ['a', 'c'], ['c']]
In [31]:
projectSequence(seq, ["a"], True)
Out[31]:
[['a', 'c'], ['c']]

Project a dataset

In [32]:
"""
Projects a dataset according to a given prefix, as done in PrefixSpan

Args:
    dataset: the dataset the projection is built from
    prefix: the prefix that is searched for in the sequence
    newEvent: if set to True, the first itemset is ignored
Returns:
    A (potentially empty) list of sequences
"""
def projectDatabase(dataset, prefix, newEvent):
    projectedDB = []
    for sequence in dataset:
        seqProjected = projectSequence(sequence, prefix, newEvent)
        if not seqProjected is None:
            projectedDB.append(seqProjected)
    return projectedDB
In [33]:
datasetProject = [
            [["a"], ["a", "b", "c"], ["a", "c"], ["d"], ["c", "f"]],
            [["a", "d"], ["c"], ["b", "c"], ["a", "e"]],
            [["e", "f"], ["a", "b"], ["d", "f"], ["d"], ["b"]],
            [["e"], ["g"], ["a", "f"], ["c"], ["b"], ["c"]]
        ]
In [34]:
projectDatabase(datasetProject, ["c"], False)
Out[34]:
[[['a', 'b', 'c'], ['a', 'c'], ['d'], ['c', 'f']],
 [['c'], ['b', 'c'], ['a', 'e']],
 [['c'], ['b'], ['c']]]

The main algorithm

Some more utility functions:

In [35]:
"""
Generates a list of all items that are contained in a dataset
"""
def generateItems(dataset):
    return sorted(set ([item for sublist1 in dataset for sublist2 in sublist1 for item in sublist2]))

"""
Computes a defaultdict that maps each item in the dataset to its support
"""
def generateItemSupports(dataset, ignoreFirstEvent=False, prefix=[]):
    result = defaultdict(int)
    for sequence in dataset:
        if ignoreFirstEvent:
            sequence = sequence[1:]
        cooccurringItems = set()
        for itemset in sequence:
            if all(x in itemset for x in prefix):
                for item in itemset:
                    if not item in prefix:
                        cooccurringItems.add(item)
        for item in cooccurringItems:
            result [item] += 1
    return sorted(result.items())

Finally, the algorithm:

In [36]:
"""
The PrefixSpan algorithm. Computes the frequent sequences in a seqeunce dataset for a given minSupport

Args:
    dataset: A list of sequences, for which the frequent (sub-)sequences are computed
    minSupport: The minimum support that makes a sequence frequent
Returns:
    A list of tuples (s, c), where s is a frequent sequence, and c is the count for that sequence
"""
def prefixSpan(dataset, minSupport):
    result = []
    itemCounts = generateItemSupports(dataset)
    for item, count in itemCounts:
        if count >= minSupport:
            newPrefix = [[item]]
            result.append((newPrefix, count))
            result.extend(prefixSpanInternal(projectDatabase(dataset, [item], False), minSupport, newPrefix))
    return result

"""
Recursive worker of PrefixSpan, not intended for external calls: extends the
current prefix in both possible ways and mines the corresponding projected
databases.

Args:
    dataset: the database already projected on prevPrefixes
    minSupport: the minimum support that makes a sequence frequent
    prevPrefixes: the prefix mined so far (only read and deep-copied here,
        so the mutable default list is never actually mutated)
Returns:
    A list of (sequence, count) tuples for all frequent extensions of the prefix
"""
def prefixSpanInternal(dataset, minSupport, prevPrefixes=[]):
    result = []
    
    # Add a new item to the last element (==same time)
    # Only items lexicographically larger than the last prefix item are taken,
    # so each extended itemset is generated exactly once.
    itemCountSameEvent = generateItemSupports(dataset, False, prefix=prevPrefixes[-1])
    for item, count in itemCountSameEvent:
        if (count >= minSupport) and item > prevPrefixes[-1][-1]:
            newPrefix = copy.deepcopy(prevPrefixes)
            newPrefix[-1].append(item)
            result.append((newPrefix, count))
            # project on the grown last event and recurse
            result.extend(prefixSpanInternal(projectDatabase(dataset, newPrefix[-1], False), minSupport, newPrefix))
        
    # Add a new event to the prefix
    # (ignoreFirstEvent=True: the new event must start strictly after the match)
    itemCountSubsequentEvents = generateItemSupports(dataset, True)
    for item, count in itemCountSubsequentEvents:
        if count >= minSupport:
            newPrefix = copy.deepcopy(prevPrefixes)
            newPrefix.append([item])
            result.append((newPrefix, count))
            result.extend(prefixSpanInternal(projectDatabase(dataset, [item], True), minSupport, newPrefix))
    return result
In [37]:
prefixSpan(dataset, 2)
Out[37]:
[([['a']], 4),
 ([['a', 'b']], 2),
 ([['a', 'b'], ['c']], 2),
 ([['a', 'b'], ['c'], ['c']], 2),
 ([['a'], ['b']], 4),
 ([['a'], ['b', 'c']], 2),
 ([['a'], ['b'], ['c']], 3),
 ([['a'], ['c']], 4),
 ([['a'], ['c'], ['b']], 3),
 ([['a'], ['c'], ['b'], ['c']], 2),
 ([['a'], ['c'], ['c']], 4),
 ([['b']], 4),
 ([['b', 'c']], 2),
 ([['b'], ['c']], 3),
 ([['b'], ['c'], ['c']], 2),
 ([['c']], 4),
 ([['c'], ['b']], 3),
 ([['c'], ['b'], ['c']], 2),
 ([['c'], ['c']], 4)]

Filter for closed and maximal patterns

Closed patterns

In [38]:
"""
Given a list of all frequent sequences and their counts, compute the set of closed frequent sequence (as a list)
This is only a very simplistic (naive) implementation for demonstration purposes!
"""
def filterClosed(result):
    for supersequence, countSeq in copy.deepcopy(result):
        for subsequence, countSubSeq in copy.deepcopy(result):
            if isSubsequence(supersequence, subsequence) and (countSeq == countSubSeq) and subsequence != supersequence:
                result.remove((subsequence, countSubSeq))
In [39]:
result = prefixSpan(dataset, 2)
filterClosed(result)
result
Out[39]:
[([['a', 'b'], ['c'], ['c']], 2),
 ([['a'], ['b']], 4),
 ([['a'], ['b', 'c']], 2),
 ([['a'], ['b'], ['c']], 3),
 ([['a'], ['c'], ['b']], 3),
 ([['a'], ['c'], ['b'], ['c']], 2),
 ([['a'], ['c'], ['c']], 4)]

Maximal sequences

In [40]:
"""
Given a list of all frequent sequences and their counts, compute the set of maximal frequent sequence (as a list)
This is only a very naive implementation for demonstration purposes!
"""
def filterMaximal(result):
    for supersequence, countSeq in copy.deepcopy(result):
        for subsequence, countSubSeq in copy.deepcopy(result):
            if isSubsequence (supersequence, subsequence) and subsequence != supersequence:
                result.remove((subsequence, countSubSeq)) 
In [41]:
result = prefixSpan (dataset, 2)
filterMaximal(result)
result
Out[41]:
[([['a', 'b'], ['c'], ['c']], 2),
 ([['a'], ['b', 'c']], 2),
 ([['a'], ['c'], ['b'], ['c']], 2)]

Application example: Wikispeedia

Now we try to find some sequential patterns in a real world dataset: Wikispeedia

First load the dataset.

In [42]:
import csv

sequences = []

# Dataset from https://snap.stanford.edu/data/wikispeedia.html
# The with-statement closes the file deterministically (the original left the
# handle open inside the generator for the lifetime of the kernel).
with open("data/paths_finished.tsv") as pathsFile:
    dataRows = (row for row in pathsFile if not row.startswith('#'))
    for line in csv.reader(dataRows, delimiter='\t'):
        # skip empty/short lines: the click path lives in column 3
        if len(line) < 4:
            continue
        seq = line[3].split(";")
        # for simplicity, remove back clicks ("<")
        seq = [x for x in seq if x != "<"]
        sequences.append(seq)

Convert this to the list of lists of lists of strings that we use as our data format.

In [43]:
# Wrap every visited page into its own singleton event, so each click path
# becomes a list of one-item itemsets (our dataset format).
wikispeediaData = [[[item] for item in seq] for seq in sequences]
In [44]:
%time prefixSpan (wikispeediaData, 1000)
Wall time: 46.3 s
Out[44]:
[([['Africa']], 2738),
 ([['Agriculture']], 1147),
 ([['Animal']], 1666),
 ([['Asia']], 1167),
 ([['Asteroid']], 1171),
 ([['Asteroid'], ['Viking']], 1043),
 ([['Atlantic_Ocean']], 1297),
 ([['Bird']], 1182),
 ([['Brain']], 1316),
 ([['Brain'], ['Telephone']], 1043),
 ([['China']], 1110),
 ([['Christianity']], 1074),
 ([['Computer']], 1528),
 ([['Earth']], 3176),
 ([['England']], 3261),
 ([['English_language']], 1414),
 ([['Europe']], 4303),
 ([['France']], 1588),
 ([['Germany']], 1738),
 ([['Human']], 1604),
 ([['India']], 1216),
 ([['Internet']], 1023),
 ([['Japan']], 1070),
 ([['Mammal']], 1568),
 ([['North_America']], 1861),
 ([['Periodic_table']], 1394),
 ([['Plant']], 1127),
 ([['Russia']], 1007),
 ([['Science']], 1479),
 ([['Telephone']], 1251),
 ([['Theatre']], 1034),
 ([['United_Kingdom']], 3807),
 ([['United_Nations']], 1050),
 ([['United_States']], 8675),
 ([['Viking']], 1192),
 ([['World_War_II']], 2267),
 ([['Zebra']], 1041)]

This will take a long time:

In [ ]:
# %time apriori(wikispeediaData, 1000)