Source code for pewanalytics.stats.clustering

from __future__ import print_function

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score


def compute_kmeans_clusters(features, k=10, return_score=False, **kwargs):
    """
    Uses K-Means to cluster an arbitrary set of features. This function expects input data where the
    rows are units and columns are features. Additional keyword arguments are passed to
    :py:class:`sklearn.cluster.KMeans`.

    :param features: TF-IDF sparse matrix or :py:class:`pandas.DataFrame`
    :param k: The number of clusters to extract
    :type k: int
    :param return_score: If True, the function returns a tuple with the cluster assignments and the
        silhouette score of the clustering; otherwise the function just returns a list of cluster
        labels for each row. (Default=False)
    :type return_score: bool
    :param kwargs: Additional :py:class:`sklearn.cluster.KMeans` parameters (e.g. ``random_state`` to
        make the cluster assignments reproducible)
    :return: A list with the cluster label for each row, or a tuple containing the labels followed by
        the silhouette score of the K-Means model. The score is ``nan`` when the silhouette
        coefficient is undefined (fewer than two distinct labels, or as many labels as rows).
    :rtype: list or tuple

    Usage::

        from pewanalytics.stats.clustering import compute_kmeans_clusters
        from sklearn import datasets
        import pandas as pd

        # The iris dataset is a common example dataset included in scikit-learn with 3 main clusters
        # Let's see if we can find them
        df = pd.DataFrame(datasets.load_iris().data)

        >>> df['cluster'] = compute_kmeans_clusters(df, k=3)
        KMeans: n_clusters 3, score is 0.5576853964035263

        >>> df['cluster'].value_counts()
        1    62
        0    50
        2    38
        Name: cluster, dtype: int64

    """

    km = KMeans(n_clusters=k, **kwargs)
    labels = km.fit_predict(features)
    label_list = labels.tolist()

    # The silhouette coefficient is only defined for 2 <= n_labels <= n_samples - 1, and
    # scikit-learn raises ValueError outside that range. The score is a diagnostic, so a
    # degenerate clustering (e.g. k=1) should not discard the labels that were already computed.
    distinct_labels = len(set(label_list))
    if 1 < distinct_labels < len(label_list):
        silhouette_avg = silhouette_score(features, labels)
    else:
        silhouette_avg = float("nan")
    print("KMeans: n_clusters {}, score is {}".format(k, silhouette_avg))

    if return_score:
        return (label_list, silhouette_avg)
    return label_list
def compute_hdbscan_clusters(features, min_cluster_size=100, min_samples=1, **kwargs):
    """
    Fits HDBSCAN* to the given features, letting the algorithm decide how many clusters exist, and
    maps each unit to one of them. Input data is expected to have one row per unit and one column per
    feature. Any extra keyword arguments are handed straight to ``hdbscan.HDBSCAN``; see the official
    documentation for details: https://hdbscan.readthedocs.io/en/latest

    :param features: TF-IDF sparse matrix or :py:class:`pandas.DataFrame`
    :param min_cluster_size: Minimum number of documents/units that can exist in a cluster
    :type min_cluster_size: int
    :param min_samples: Passed through as HDBSCAN's ``min_samples`` (see HDBSCAN documentation for more)
    :type min_samples: int
    :param kwargs: Additional HDBSCAN parameters: \
        https://hdbscan.readthedocs.io/en/latest/parameter_selection.html
    :return: The fitted model's ``labels_`` (one cluster label per row), returned as-is without
        conversion to a list

    Usage::

        from pewanalytics.stats.clustering import compute_hdbscan_clusters
        from sklearn import datasets
        import pandas as pd

        df = pd.DataFrame(datasets.load_iris().data)

        >>> df['cluster'] = compute_hdbscan_clusters(df, min_cluster_size=10)
        HDBSCAN: n_clusters 2

        >>> df['cluster'].value_counts()
        1    100
        0    50
        Name: cluster, dtype: int64

    """

    # Imported lazily so the module can be loaded without hdbscan installed.
    import hdbscan

    model = hdbscan.HDBSCAN(
        min_cluster_size=min_cluster_size, min_samples=min_samples, **kwargs
    )
    model.fit(features)

    assignments = model.labels_
    # The cluster count is reported as the highest label plus one.
    n_found = assignments.max() + 1
    print("HDBSCAN: n_clusters {}".format(n_found))

    return assignments