I have done clustering with K-Means using sklearn. While it has a method to print the centroids, I find it rather bizarre that scikit-learn doesn't have a method to report the size of each cluster (or at least I have not found one so far). Is there a neat way to get the size of each cluster, i.e. how many points are associated with each cluster? I currently have the rather kludgy code below: it finds the clusters that contain a single point, and I then need to add other points to such a cluster by measuring the Euclidean distance between the points, and update the labels accordingly.
import numpy as np
from clustering.clusternew import Kmeans_clu
from evolution.generate import reproduction
from mapping.somnew import mapping, no_of_neurons, neuron_weights_init
from population_creation.population import pop_create
from New_SOL import newsol
# Driver script: cluster the iris data once per chromosome, build a population
# from the resulting centres and pass it to ``newsol``.
#
# NOTE(review): the indentation of this script was lost in the source; the
# loop body below is reconstructed from the data flow -- confirm against the
# original file.
# NOTE(review): ``max_cluster`` and ``features`` are not defined anywhere in
# the visible source; they must be defined before ``pop_create`` is called.

# Columns 0-3 are read with the default numeric dtype; column 4 is read as
# strings. ``genfromtxt`` is reached through the ``np`` alias because only
# ``import numpy as np`` is in scope.
data = np.genfromtxt('iris.csv', delimiter=',', skip_header=0, usecols=range(0, 4))
actual_label = np.genfromtxt('iris.csv', delimiter=',', dtype=str, skip_header=0, usecols=(4))

chromosome = int(input("Enter the number of chromosomes: "))  # population size
max_gen = int(input("Enter the maximum number of generation: "))  # generation limit

# Per-chromosome bookkeeping. These lists were previously used without ever
# being created, which raised NameError on the first ``insert``.
K = []           # number of clusters used for each chromosome
lab = []         # label array returned by Kmeans_clu for each chromosome
center = []      # cluster centres returned by Kmeans_clu for each chromosome
population = []  # result of pop_create for each chromosome

for i in range(0, chromosome):
    # Fixed at 3 for now; the original intent was a random value between 2
    # and ``max_cluster``.
    cluster = 3
    K.insert(i, cluster)
    print('value of K is ', K)
    u, label, z1, A1 = Kmeans_clu(cluster, data)
    lab.insert(i, label)
    center.insert(i, u)
    new_center = pop_create(max_cluster, features, cluster, u)
    population.insert(i, new_center)

# NOTE(review): assumed to run once after the loop -- confirm.
print("VAlue of population in main\n", population)
newsol(max_gen, population, data)
The newsol method receives the new population generated by the code above and runs K-Means again on that population:
def ClusterIndicesComp(clustNum, labels_array):
    """Return the positions in ``labels_array`` whose label equals ``clustNum``.

    Args:
        clustNum: the cluster label to look for.
        labels_array: one-dimensional array-like of cluster labels.

    Returns:
        A one-dimensional integer ndarray of matching positions, in ascending
        order. It is empty, but still integer-typed, when no label matches,
        so the result can always be used directly as an index array.
    """
    # np.array([]) built from an empty list defaults to float64, which NumPy
    # rejects as an index; flatnonzero always yields integer indices.
    return np.flatnonzero(np.asarray(labels_array) == clustNum)
def newsol(max_gen, population, data):
    """Re-cluster ``population`` each generation and inspect singleton clusters.

    For every generation, ``Kmeans_clu`` is run on ``population`` with five
    clusters and its results are inserted into the module-level lists ``A1``,
    ``plab`` and ``pcenter``. For every cluster that contains exactly one
    point, the Euclidean distance from that point to each member of every
    cluster holding more than two points is printed.

    Args:
        max_gen: number of generations (iterations of the outer loop).
        population: samples handed to ``Kmeans_clu`` for clustering.
        data: array indexed with the member positions of each cluster.

    Returns:
        None. Results are printed and recorded in ``A1``, ``plab`` and
        ``pcenter``.

    NOTE(review): the indentation of this function was lost in the source;
    the nesting below is reconstructed from the data flow -- confirm.
    NOTE(review): the labels come from clustering ``population`` but are used
    to index ``data``; verify that the two are row-aligned.
    NOTE(review): ``A1``, ``plab`` and ``pcenter`` are expected to exist as
    module-level lists; they are not defined in the visible source.
    """
    # Local import so the function does not rely on a module-level import
    # that is not visible alongside this definition.
    from collections import Counter

    cluster1 = 5  # number of clusters used when re-clustering the population

    # ``gen`` (not ``i``) so the innermost loop can no longer shadow the
    # generation index.
    for gen in range(max_gen):
        centres, labels, members, model = Kmeans_clu(cluster1, population)
        A1.insert(gen, members)
        plab.insert(gen, labels)
        pcenter.insert(gen, centres)

        # Number of points assigned to each cluster label.
        sizes = Counter(model.labels_)
        singletons = [cid for cid, size in sizes.items() if size == 1]

        for lone in singletons:
            print("Value in NEW_SOL is of 1 length cluster\n", lone)
            plot1 = data[ClusterIndicesComp(lone, model.labels_)]
            print("Values are in sol of plot1", plot1)

            for q in range(cluster1):
                plot2 = data[ClusterIndicesComp(q, model.labels_)]
                print("VAlue of plot2 is for \n", q, plot2)

                # Test the size of *this* cluster. The previous check was
                # true whenever any cluster had more than two points.
                # Counter returns 0 for labels that never occurred.
                big_enough = sizes[q] > 2
                for plotk in plot2:
                    if big_enough:
                        dist = np.linalg.norm(np.array(plot1) - np.array(plotk))
                        print("Distance between plot1 and plotk is", plot1, plotk, dist)
                    else:
                        print("NO distance between them\n")
The K-Means helper I have written is:
from sklearn.cluster import KMeans
import numpy as np
def Kmeans_clu(K, data):
    """Fit K-Means with ``K`` clusters on ``data`` and return its results.

    The estimator is configured with ``init='random'``, ``n_init=1`` and
    ``max_iter=1``, i.e. a single run capped at one iteration. No random
    seed is set, so repeated calls may give different results.

    Args:
        K: number of clusters to form.
        data: samples passed to ``KMeans.fit``.

    Returns:
        A 4-tuple ``(centres, labels, members, estimator)``: the cluster
        centres, the per-sample labels, a dict mapping each cluster id to
        the array of sample positions carrying that label, and the fitted
        ``KMeans`` object.
    """
    estimator = KMeans(n_clusters=K, init='random', max_iter=1, n_init=1)
    estimator.fit(data)  # fit() returns the estimator itself

    assignments = estimator.labels_

    # Sample positions grouped by the cluster they were assigned to.
    members = {}
    for cluster_id in range(estimator.n_clusters):
        members[cluster_id] = np.where(assignments == cluster_id)[0]

    return estimator.cluster_centers_, assignments, members, estimator
To get the number of instances in each cluster, you can try using Counter:
from collections import Counter, defaultdict
# Count how often each label occurs in ``estimator.labels_`` and print the
# tally (``estimator`` is presumably an already fitted KMeans -- confirm).
print(Counter(estimator.labels_))
Result:
Counter({0: 62, 1: 50, 2: 38})
where cluster 0 has 62 instances, cluster 1 has 50 instances, and cluster 2 has 38 instances
And to store the indices of the instances in each cluster, you can use defaultdict:
# Group sample positions by label: each key of ``clusters_indices`` is a
# label from ``estimator.labels_`` and each value is the list of positions
# carrying that label, in ascending order.
clusters_indices = defaultdict(list)
for position, assigned in enumerate(estimator.labels_):
    clusters_indices[assigned].append(position)
Now, to find indices of instances in cluster 0, calling:
# Print the sample positions stored under label 0.
print(clusters_indices[0])
Result:
[50, 51, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70,
71, 72, 73, 74, 75, 76, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92,
93, 94, 95, 96, 97, 98, 99, 101, 106, 113, 114, 119, 121, 123, 126, 127, 133, 138, 142, 146, 149]