In [18]:
import numpy as np
from fastcluster import linkage
from scipy.spatial.distance import squareform

distance_matrix = np.array([[0.0, .25, .25, .25],
                            [.25, 0.0, .25, .25],
                            [.25, .25, 0.0, .25],
                            [.25, .25, .25, 0.0]])
In [19]:
distance_matrix
Out[19]:
array([[ 0.  ,  0.25,  0.25,  0.25],
       [ 0.25,  0.  ,  0.25,  0.25],
       [ 0.25,  0.25,  0.  ,  0.25],
       [ 0.25,  0.25,  0.25,  0.  ]])
In [20]:
compressed_distance_matrix = squareform(distance_matrix)
Z = linkage(compressed_distance_matrix, method="complete", metric="cosine")
In [21]:
Z
Out[21]:
array([[ 0.  ,  1.  ,  0.25,  2.  ],
       [ 2.  ,  4.  ,  0.25,  3.  ],
       [ 3.  ,  5.  ,  0.25,  4.  ]])
In [28]:
# Num desired clusters
k = 2

# Count entries in the matrix
initial_count = len(distance_matrix[0])

# Fill an array with indices from the matrix
c = [[i] for i in range(initial_count)]
# Now extend with empty entries for merging into
c.extend([[]]*(initial_count))
In [38]:
# Loop the linkage matrix, making merges
# into new clusters until we have k clusters.
for i, block in enumerate(Z):
    j = i + initial_count
    a = int(block[0])
    b = int(block[1])

    # don't cluster randomly.
    if block[2] >= 1:
        break

    # merge a and b into j
    c[j] = c[a]
    c[j].extend(c[b])
    # empty a and b
    c[a] = c[b] = []
    
    # count clusters and see if we have
    # reached our target number
    cluster_count = sum([min(len(x),1) for x in c])
    if cluster_count <= k:
        break
In [37]:
clusters = [x for x in c if x]
clusters
Out[37]:
[[3], [2, 0, 1]]
In [ ]:
# I have 2 clusters of size 1 and 3, but I'd like 2 and 2
# More generally I want to cluster evenly where possible for larger arrays.
# If  I have a small k it will then start combining those clusters (also evenly).