Source code for pewanalytics.stats.clustering

from __future__ import print_function

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score


[docs]def compute_kmeans_clusters(features, k=10, return_score=False):
    """
    Uses K-Means to cluster an arbitrary set of features. This function expects input data where the rows are units \
    and columns are features.

    :param features: TF-IDF sparse matrix or :py:class:`pandas.DataFrame`
    :param k: The number of clusters to extract
    :type k: int
    :param return_score: If True, the function returns a tuple with the cluster \
    assignments and the silhouette score of the clustering; otherwise the function just returns a list of cluster \
    labels for each row. (Default=False)
    :type return_score: bool
    :return: A list with the cluster label for each row, or a tuple containing the \
    labels followed by the silhouette score of the K-Means model.
    :rtype: list

    Usage::

        from pewanalytics.stats.clustering import compute_kmeans_clusters
        from sklearn import datasets
        import pandas as pd

        # The iris dataset is a common example dataset included in scikit-learn with 3 main clusters
        # Let's see if we can find them
        df = pd.DataFrame(datasets.load_iris().data)

        >>> df['cluster'] = compute_kmeans_clusters(df, k=3)
        KMeans: n_clusters 3, score is 0.5576853964035263

        >>> df['cluster'].value_counts()
        1    62
        0    50
        2    38
        Name: cluster, dtype: int64

    """

    km = KMeans(n_clusters=k)
    labels = km.fit_predict(features)
    silhouette_avg = silhouette_score(features, labels)
    print("KMeans: n_clusters {}, score is {}".format(k, silhouette_avg))
    if not return_score:
        return km.labels_.tolist()
    else:
        return (km.labels_.tolist(), silhouette_avg)


[docs]def compute_hdbscan_clusters(features, min_cluster_size=100, min_samples=1, **kwargs):
    """
    Uses HDBSCAN* to identify the best number of clusters and map each unit to one. This function expects input data \
    where the rows are units and columns are features. Additional keyword arguments are passed to HDBSCAN. Check \
    out the official documentation for more: https://hdbscan.readthedocs.io/en/latest

    :param features: TF-IDF sparse matrix or :py:class:`pandas.DataFrame`
    :param min_cluster_size: int - minimum number of documents/units that can exist in a cluster.
    :type min_cluster_size: int
    :param min_samples: Minimum number of samples to draw (see HDBSCAN documentation for more)
    :type min_samples: int
    :param kwargs: Additional HDBSCAN parameters: https://hdbscan.readthedocs.io/en/latest/parameter_selection.html
    :return: A list with the cluster label for each row

    Usage::

        from pewanalytics.stats.clustering import compute_hdbscan_clusters
        from sklearn import datasets
        import pandas as pd

        df = pd.DataFrame(datasets.load_iris().data)

        >>> df['cluster'] = compute_hdbscan_clusters(df, min_cluster_size=10)
        HDBSCAN: n_clusters 2

        >>> df['cluster'].value_counts()
        1    100
        0     50
        Name: cluster, dtype: int64

    """

    import hdbscan

    clusterer = hdbscan.HDBSCAN(
        min_cluster_size=min_cluster_size, min_samples=min_samples, **kwargs
    )
    clusterer.fit(features)
    print("HDBSCAN: n_clusters {}".format(clusterer.labels_.max() + 1))
    return clusterer.labels_