Source code for pewanalytics.stats.dimensionality_reduction

from __future__ import print_function
from builtins import zip
from builtins import range
import pandas as pd

from mca import MCA
from sklearn.decomposition import PCA, TruncatedSVD


def _decompose(
    features, decompose_class, feature_names=None, k=20, component_prefix="component"
):
    """
    Internal helper that decomposes a set of features using a scikit-learn decomposition class and returns \
    the resulting matrices.

    :param features: A :py:class:`pandas.DataFrame` or sparse matrix where rows are documents and columns are features
    :param decompose_class: A scikit-learn decomposition class to apply (e.g. ``PCA`` or ``TruncatedSVD``)
    :param feature_names: An optional list of feature names (for sparse matrices)
    :type feature_names: list
    :param k: Number of dimensions to extract
    :type k: int
    :param component_prefix: A prefix for the column names
    :type component_prefix: str
    :return: A tuple of two :py:class:`pandas.DataFrame` s, (features x components, documents x components)
    :rtype: tuple
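
    A minimal usage sketch (illustrative only; ``df`` stands in for any documents-by-features \
    :py:class:`pandas.DataFrame`, and in practice this helper is reached through :py:func:`get_pca` \
    or :py:func:`get_lsa` below)::

        from sklearn.decomposition import PCA

        components, results = _decompose(df, PCA, k=2, component_prefix="pca")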
    """

    model = decompose_class(n_components=k)
    # Densify sparse matrices; DataFrames have no .todense() and pass through unchanged
    try:
        features = pd.DataFrame(features.todense())
    except AttributeError:
        pass
    model.fit(features)
    score = sum(model.explained_variance_ratio_)
    print("Decomposition explained variance ratio: {}".format(score))
    if not feature_names:
        feature_names = features.columns
    components = pd.DataFrame(model.components_, columns=feature_names).transpose()
    print("Top features:")
    for col in components.columns:
        print(
            "Component {}: {}".format(
                col, components.sort_values(col, ascending=False)[:10].index.values
            )
        )
    components.columns = [
        "{}_{}".format(component_prefix, c) for c in components.columns
    ]
    results = pd.DataFrame(model.transform(features), index=features.index)
    results.columns = components.columns
    results[component_prefix] = results.idxmax(axis=1)
    return (components, results)


def get_pca(features, feature_names=None, k=20):

    """
    Performs PCA on a set of features. This function expects input data where the rows are units \
    and columns are features. For more information about how PCA is implemented, visit the \
    `Scikit-Learn Documentation <https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html>`__.

    :param features: A :py:class:`pandas.DataFrame` or sparse matrix where rows are units/observations and columns \
    are features
    :param feature_names: An optional list of feature names (for sparse matrices)
    :type feature_names: list
    :param k: Number of dimensions to extract
    :type k: int
    :return: A tuple of two :py:class:`pandas.DataFrame` s, (features x components, units x components)
    :rtype: tuple

    Usage::

        from pewanalytics.stats.dimensionality_reduction import get_pca
        from sklearn import datasets
        import pandas as pd

        df = pd.DataFrame(datasets.load_iris().data)

        >>> feature_weights, df_reduced = get_pca(df, k=2)
        Decomposition explained variance ratio: 0.977685206318795
        Top features:
        Component 0: [2 0 3 1]
        Component 1: [1 0 3 2]

        >>> feature_weights
              pca_0     pca_1
        0  0.361387  0.656589
        1 -0.084523  0.730161
        2  0.856671 -0.173373
        3  0.358289 -0.075481

        >>> df_reduced.head()
              pca_0     pca_1    pca
        0 -2.684126  0.319397  pca_1
        1 -2.714142 -0.177001  pca_1
        2 -2.888991 -0.144949  pca_1
        3 -2.745343 -0.318299  pca_1
        4 -2.728717  0.326755  pca_1
    """

    return _decompose(
        features, PCA, feature_names=feature_names, k=k, component_prefix="pca"
    )
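
# A possible way to pick ``k`` for get_pca (an illustrative sketch, not part of
# pewanalytics): fit an unrestricted PCA first, then keep just enough components
# to cover ~95% of the variance. Only numpy and sklearn.decomposition.PCA are
# assumed; ``df`` is any units-by-features DataFrame.
#
#     import numpy as np
#     from sklearn.decomposition import PCA
#
#     cumulative = np.cumsum(PCA().fit(df).explained_variance_ratio_)
#     k = int(np.searchsorted(cumulative, 0.95)) + 1  # smallest k reaching 95%
#     feature_weights, df_reduced = get_pca(df, k=k)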

def get_lsa(features, feature_names=None, k=20):

    """
    Performs LSA on a set of features. This function expects input data where the rows are units \
    and columns are features. For more information about how LSA is implemented, visit the \
    `Scikit-Learn Documentation \
    <https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.TruncatedSVD.html>`__.

    :param features: A :py:class:`pandas.DataFrame` or sparse matrix where rows are units/observations and columns \
    are features
    :param feature_names: An optional list of feature names (for sparse matrices)
    :type feature_names: list
    :param k: Number of dimensions to extract
    :type k: int
    :return: A tuple of two :py:class:`pandas.DataFrame` s, (features x components, units x components)
    :rtype: tuple

    Usage::

        from pewanalytics.stats.dimensionality_reduction import get_lsa
        from sklearn import datasets
        import pandas as pd

        df = pd.DataFrame(datasets.load_iris().data)

        >>> feature_weights, df_reduced = get_lsa(df, k=2)
        Decomposition explained variance ratio: 0.9772093692426493
        Top features:
        Component 0: [0 2 1 3]
        Component 1: [1 0 3 2]

        >>> feature_weights
              lsa_0     lsa_1
        0  0.751108  0.284175
        1  0.380086  0.546745
        2  0.513009 -0.708665
        3  0.167908 -0.343671

        >>> df_reduced.head()
              lsa_0     lsa_1    lsa
        0  5.912747  2.302033  lsa_0
        1  5.572482  1.971826  lsa_0
        2  5.446977  2.095206  lsa_0
        3  5.436459  1.870382  lsa_0
        4  5.875645  2.328290  lsa_0
    """

    return _decompose(
        features, TruncatedSVD, feature_names=feature_names, k=k, component_prefix="lsa"
    )
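
# ``feature_names`` matters for sparse input, which carries no column labels of
# its own. A minimal sketch (illustrative; reuses the TfidfVectorizer setup from
# the correspondence_analysis example below; get_feature_names_out() assumes
# scikit-learn >= 1.0, and is wrapped in list() because _decompose truth-tests
# the argument, which would fail on a numpy array):
#
#     vec = TfidfVectorizer(min_df=10, max_df=.9).fit(df["text"])
#     tfidf = vec.transform(df["text"])  # scipy sparse matrix
#     feature_weights, df_reduced = get_lsa(
#         tfidf, feature_names=list(vec.get_feature_names_out()), k=2
#     )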

def correspondence_analysis(edges, n=1):

    """
    Performs correspondence analysis on a set of features. Most useful in the context of network analysis, \
    where you might wish to, for example, identify the underlying dimension in a network of Twitter users \
    by using a matrix representing whether or not they follow one another (when news and political accounts \
    are included, the underlying dimension often appears to approximate the left-right political spectrum).

    :param edges: An NxN :py:class:`pandas.DataFrame` where both the rows and columns are "nodes" and the values \
    are some sort of closeness or similarity measure (like a cosine similarity matrix)
    :param n: The number of dimensions to extract
    :type n: int
    :return: A :py:class:`pandas.DataFrame` where rows are the units and the columns correspond to the extracted \
    dimensions
    :rtype: pandas.DataFrame

    Usage::

        from pewanalytics.stats.dimensionality_reduction import correspondence_analysis
        import nltk
        import pandas as pd
        from sklearn.metrics.pairwise import linear_kernel
        from sklearn.feature_extraction.text import TfidfVectorizer

        nltk.download("inaugural")
        df = pd.DataFrame([
            {"speech": fileid, "text": nltk.corpus.inaugural.raw(fileid)}
            for fileid in nltk.corpus.inaugural.fileids()
        ])
        vec = TfidfVectorizer(min_df=10, max_df=.9).fit(df['text'])
        tfidf = vec.transform(df['text'])
        cosine_similarities = linear_kernel(tfidf)
        matrix = pd.DataFrame(cosine_similarities, columns=df['speech'])

        # Looks like the main source of variation in the language of inaugural speeches is time!
        >>> mca = correspondence_analysis(matrix)

        >>> mca.sort_values("mca_1").head()
                        node     mca_1
        57  1993-Clinton.txt -0.075508
        56    2017-Trump.txt -0.068168
        55  1997-Clinton.txt -0.061567
        54    1973-Nixon.txt -0.060698
        53     1989-Bush.txt -0.056305

        >>> mca.sort_values("mca_1").tail()
                        node     mca_1
        4     1877-Hayes.txt  0.040037
        3    1817-Monroe.txt  0.040540
        2      1845-Polk.txt  0.042847
        1    1849-Taylor.txt  0.050937
        0   1829-Jackson.txt  0.056201
    """

    mca_counts = MCA(edges)
    rows = []
    # Pair each node with its row factor scores, sorted by the first dimension
    for r in sorted(
        zip(edges.columns, mca_counts.fs_r(N=n)),
        key=lambda x: x[1][0],
        reverse=True,
    ):
        row = {"node": r[0]}
        for i in range(n):
            try:
                row["mca_{}".format(i + 1)] = r[1][i]
            except IndexError:
                # fs_r may return fewer than n dimensions
                pass
        rows.append(row)
    mca = pd.DataFrame(rows)
    return mca
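
# Extracting more than one dimension is just a matter of raising ``n``
# (an illustrative sketch; ``matrix`` is the cosine-similarity DataFrame
# built in the correspondence_analysis usage example above):
#
#     mca2 = correspondence_analysis(matrix, n=2)  # adds an "mca_2" column
#     mca2.sort_values("mca_2").head()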