python – k means, does it guarantee exactly k clusters to be returned?


I have implemented k means as below, and as far as I know, it doesn’t guarantee k grouping when given k. But the task I have which is https://justpaste.it/5cmte, requires me to return exactly k clusters. How should I go around that?

K means:

import sys, random, math
from collections import defaultdict

def pointAvg(points):
    dimensions = len(points(0))
    newCenter = ()
    for dimension in range(dimensions):
        dimSum = 0
        for p in points:
            dimSum += p(dimension)
        newCenter.append(dimSum / float(len(points)))
    return newCenter

def updateCenters(dataSet, assignments):
    newMeans = defaultdict(list)
    centers = ()
    for assignment, point in zip(assignments, dataSet):
        newMeans(assignment).append(point)
    for points in newMeans.values():
        centers.append(pointAvg(points))
    return centers, newMeans
    
def assignPoints(dataPoints, centers):
    assignments = ()
    for point in dataPoints:
        shortest = float('inf')
        shortestIndex, distances = 0, 0
        for i in range(len(centers)):
            val = distance(point, centers(i))
            distances += val
            if val < shortest:
                shortest = val
                shortestIndex = i
        assignments.append(shortestIndex)
    return (assignments, distances)

def distance(a, b):
    dimensions = len(a)
    _sum = 0
    for dimension in range(dimensions):
        differenceSq = (a(dimension) - b(dimension)) ** 2
        _sum += differenceSq
    return math.sqrt(_sum)

def generateK(dataSet, k):
    centers = ()
    dimensions = len(dataSet(0))
    minMax = defaultdict(int)
    for point in dataSet:
        for i in range(dimensions):
            val = point(i)
            minKey = 'min_%d' % i
            maxKey = 'max_%d' % i
            if minKey not in minMax or val < minMax(minKey):
                minMax(minKey) = val
            if maxKey not in minMax or val > minMax(maxKey):
                minMax(maxKey) = val
    for _k in range(k):
        randPoint = ()
        for i in range(dimensions):
            minVal = minMax('min_%d' % i)
            maxVal = minMax('max_%d' % i)
            randPoint.append(random.uniform(minVal, maxVal))
        centers.append(randPoint)
    return centers

def kMeans(dataset, k):
    kPoints = generateK(dataset, k)
    assignments = assignPoints(dataset, kPoints)(0)
    oldAssignments = None
    while assignments != oldAssignments:
        updatedCenters = updateCenters(dataset, assignments)
        newCenters = updatedCenters(0)
        means = updatedCenters(1)
        oldAssignments = assignments
        assignedPoints = assignPoints(dataset, newCenters)
        assignments = assignedPoints(0)
        sumDistances = assignedPoints(1) 
    print(sumDistances)
    return (newCenters, means)
    
if __name__ == '__main__':

    # sys.stdin  = open('input.txt', 'r')
    # sys.stdout = open('output.txt', 'w')
    
    i, points = 0, ()
    for line in sys.stdin:
        line = line.rstrip()
        line = line.split()
        line(0), line(1) = int(line(0)), int(line(1))
        if (i == 0):
            n, k = line(0), line(1)
        else:
            points.append(line)
        i += 1
        
    clusters = kMeans(points, k)
    for i in range(k):
        print('%g' % clusters(0)(i)(0), '%g' % clusters(0)(i)(1), len(clusters(1)(i)), end=' ')
        for j in range(len(clusters(1)(i))):
            print(points.index(clusters(1)(i)(j)) + 1, end=' ')
        print()

Example input:

6 3

1 2

5 5

7 3

2 4

9 9

3 6

Output:

10.8982

1.5 3 2 1 4

3 6 1 6

7 5.66667 3 2 3 5

Sometimes, the same input, gives k-1, k-2 clusters instead of k.

So my point is that. k means does not guarantee k clusters. I am required to return exact k clusters, how should I do that?