k-means clustering is a popular unsupervised machine learning algorithm that groups data points into distinct clusters. As the name suggests, the algorithm divides a data set into a given number (k) of clusters, each represented by the arithmetic mean of its points. In this introduction, we will explore how to use k-means in Python and explain how the algorithm works.
To begin, let's define a function that will help us visualize the results. The function will create a scatter plot with different colors representing the clusters.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns


def plot_clusters(data, cluster):
    sns.scatterplot(x='x', y='y', data=data, hue=cluster, palette='tab10')
    plt.title('k-means Clustering')
    plt.xlabel('Feature 1')
    plt.ylabel('Feature 2')
Now, let's generate some random toy data to demonstrate the k-means algorithm. We'll use a two-dimensional data set and plot it using the plot_clusters function.
rng = np.random.default_rng(seed=0)
data = pd.DataFrame({'x': rng.normal(size=35),
                     'y': rng.normal(size=35)})

# set a fixed cluster number for all points
plot_clusters(data, 0)
The data points are currently plotted with a single color, indicating that they haven't been clustered yet.
To apply the k-means algorithm, we'll use the KMeans class from the sklearn package in Python. First, we specify the algorithm by choosing the method for selecting the initial cluster centers, the number of clusters we want, and the number of times the algorithm should run. Second, we fit the algorithm to our data.
from sklearn.cluster import KMeans

kmeans = KMeans(init="random",
                n_clusters=4,
                n_init=1,
                random_state=0)
kmeans.fit(data)
After running the fit, we can access the assigned cluster for each data point using kmeans.labels_.
Let's update our plot to visualize the clusters obtained from k-means.
clusters = kmeans.labels_
plot_clusters(data, clusters)
The data points are colored based on the assigned clusters. The algorithm has separated the data into four distinct clusters.
Now that we've seen how to apply the algorithm and visualize the resulting clusters, let's explore the algorithm in depth.
When clustering data, we aim for a high similarity of data points within each cluster and maximal dissimilarity between different clusters. The k-means algorithm achieves this by minimizing the sum of squared distances between points and their assigned centroid within each cluster. The corresponding optimization problem is $$\underset{\mathcal{C}}{\arg \min} \left( \sum_{i=1}^{k} \sum_{\boldsymbol{x}\in C_i}\| \boldsymbol{x} - \boldsymbol{\mu}_i \|^2\right),$$
with $\boldsymbol{x}\in \mathbb{R}^d$ where $d\in \mathbb{N}$ is the dimensionality of the data, $k\in \mathbb{N}$ the number of clusters, $\mathcal{C}=\{C_1,C_2, ... , C_k\}$ the set of clusters and $\boldsymbol{\mu}_i \in \mathbb{R}^d$ the mean (or centroid) of cluster $C_i$. $\|\cdot\|$ is usually the Euclidean norm, but based on the nature of the data or problem, other distance metrics can be used.
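As a quick check, the following sketch (not part of the original code, and assuming the data, kmeans and clusters objects from the cells above) evaluates this objective for the clustering computed earlier; scikit-learn exposes the same quantity as the fitted model's inertia_ attribute.
# evaluate the k-means objective: within-cluster sum of squared distances
# (assumes `data`, `kmeans` and `clusters` from the code cells above)
points = data[['x', 'y']].to_numpy()
centers = kmeans.cluster_centers_

wcss = 0.0
for i in range(kmeans.n_clusters):
    diff = points[clusters == i] - centers[i]   # x - mu_i for cluster C_i
    wcss += (diff ** 2).sum()                   # add the squared Euclidean norms

print(wcss)             # value of the objective for this clustering
print(kmeans.inertia_)  # scikit-learn reports the same quantity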
The algorithm can be broken down into five steps:

1. Choose the number of clusters k, e.g. k = 2.
2. Select k random data points as the initial centroids.
3. Assign each data point to its closest centroid.
4. Recompute each centroid as the arithmetic mean of the data points assigned to it.
5. Repeat steps 3 and 4 until a stopping criterion is fulfilled.

To demonstrate how k-means works, we will now develop the algorithm step by step from scratch. We start again with a set of randomly distributed, unclustered data points.
plot_clusters(data, 0)
For the algorithm itself, the first step is trivial: we simply pass k, along with the data, to our function. Let's call our function k_means_clustering.
k = 4


def k_means_clustering(data, k):
    pass  # we will fill this soon
The initial centroids will be k randomly chosen data points from our data set.
def k_means_clustering(data, k):
    # select k centroids from the initial points
    centroids = data.sample(n=k, random_state=0).reset_index(drop=True)
    return centroids
centroids = k_means_clustering(data, k)

plot_clusters(data, 0)
sns.scatterplot(x='x', y='y', data=centroids,
                s=100, marker='s', hue=[0, 1, 2, 3],
                palette=['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728'],
                legend=False)
plt.show()
So far, our function only randomly selects a given number of points. Next, we form clusters by assigning each point to its closest centroid. To do so, we compute the distance of every data point to each centroid and pick the minimum for each point. Usually, the Euclidean norm, commonly known as the Euclidean distance, serves as the distance metric; depending on the nature of the data or problem, other distance metrics can be used as well.
def k_means_clustering(data, k):
    # select k centroids from the initial points
    centroids = data.sample(n=k, random_state=0).reset_index(drop=True)

    # compute distances (Euclidean norm)
    distances = pd.DataFrame()
    for cent in range(k):
        distances[cent] = np.sqrt((data['x'] - centroids.at[cent, 'x'])**2
                                  + (data['y'] - centroids.at[cent, 'y'])**2)

    # assign data to centroids
    cluster_assignments = distances.idxmin(axis=1)

    return cluster_assignments, centroids
clusters, centroids = k_means_clustering(data, k)

plot_clusters(data, clusters)
sns.scatterplot(x='x', y='y', data=centroids,
                s=100, marker='s', hue=[0, 1, 2, 3],
                palette=['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728'],
                legend=False)
plt.show()
Now that we have an initial clustering, we compute the arithmetic mean of each cluster to obtain the new centroids.
def k_means_clustering(data, k):
    # select k centroids from the initial points
    centroids = data.sample(n=k, random_state=0).reset_index(drop=True)

    # compute distances (Euclidean norm)
    distances = pd.DataFrame()
    for cent in range(k):
        distances[cent] = np.sqrt((data['x'] - centroids.at[cent, 'x'])**2
                                  + (data['y'] - centroids.at[cent, 'y'])**2)

    # assign data to centroids
    cluster_assignments = distances.idxmin(axis=1)

    # update centroids based on mean of the assigned data points
    new_centroids = pd.DataFrame(columns=['x', 'y'], index=range(k))
    for cent in range(k):
        new_centroids.iloc[cent] = pd.Series({'x': np.mean(data[cluster_assignments == cent].x),
                                              'y': np.mean(data[cluster_assignments == cent].y)})

    return cluster_assignments, new_centroids
clusters, centroids = k_means_clustering(data, k)

plot_clusters(data, clusters)
sns.scatterplot(x='x', y='y', data=centroids,
                s=100, marker='s', hue=[0, 1, 2, 3],
                palette=['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728'],
                legend=False)
plt.show()
Finally, we iterate steps 3 and 4 until a stopping criterion is fulfilled. Possible stopping criteria are, for example, that the centroids no longer change, that the cluster assignments stay the same, or that a maximum number of iterations is reached. Here, we will stop the iteration when the centroids converge to a fixed position.
def k_means_clustering(data, k):
    # select k centroids from the initial points
    centroids = data.sample(n=k, random_state=0).reset_index(drop=True)

    # stopping criterion
    converged = False

    # iteration
    while not converged:
        # compute distances (Euclidean norm)
        distances = pd.DataFrame()
        for cent in range(k):
            distances[cent] = np.sqrt((data['x'] - centroids.at[cent, 'x'])**2
                                      + (data['y'] - centroids.at[cent, 'y'])**2)

        # assign data to centroids
        cluster_assignments = distances.idxmin(axis=1)

        # update centroids based on mean of the assigned data points
        new_centroids = pd.DataFrame(columns=['x', 'y'], index=range(k))
        for cent in range(k):
            new_centroids.iloc[cent] = pd.Series({'x': np.mean(data[cluster_assignments == cent].x),
                                                  'y': np.mean(data[cluster_assignments == cent].y)})

        # check for convergence: stop when the centroids no longer change
        if new_centroids.equals(centroids):
            converged = True
        else:
            centroids = new_centroids

    return cluster_assignments, new_centroids
clusters, centroids = k_means_clustering(data, k)

plot_clusters(data, clusters)
sns.scatterplot(x='x', y='y', data=centroids,
                s=100, marker='s', hue=[0, 1, 2, 3],
                palette=['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728'],
                legend=False)
plt.show()
The k-means algorithm has some important properties:

- The choice of the number of clusters k significantly affects the final clustering result. But which number of clusters is optimal? Usually, when using this method, you want to divide the data set into a specific number of clusters. If you are not sure, there are several methods to determine a suitable number, such as the Elbow method, which compares several runs with different values of k, the Average silhouette method and the Gap statistic method, among others.
- Because of the random initial placement of the centroids, running the algorithm several times can lead to different clustering solutions. To find the optimal clustering, multiple runs are recommended.
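To illustrate how such a choice can be made, here is a minimal sketch (not part of the original tutorial) of the Elbow method using scikit-learn's KMeans: we fit the model for several values of k with multiple random initializations (n_init) and plot the within-cluster sum of squares (inertia_); a pronounced bend in the curve suggests a suitable number of clusters.
# sketch of the Elbow method (assumes `data` and the imports from above;
# the range of k values and n_init=10 are illustrative choices)
inertias = []
k_values = range(1, 9)
for k_try in k_values:
    model = KMeans(n_clusters=k_try, n_init=10, random_state=0)
    model.fit(data)
    inertias.append(model.inertia_)

plt.plot(list(k_values), inertias, marker='o')
plt.xlabel('number of clusters k')
plt.ylabel('within-cluster sum of squares')
plt.title('Elbow Method')
plt.show()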
Citation
The E-Learning project SOGA-Py was developed at the Department of Earth Sciences by Annette Rudolph, Joachim Krois and Kai Hartmann. You can reach us via mail at soga[at]zedat.fu-berlin.de.
Please cite as follows: Rudolph, A., Krois, J., Hartmann, K. (2023): Statistics and Geodata Analysis using Python (SOGA-Py). Department of Earth Sciences, Freie Universitaet Berlin.