Source code for pewanalytics.stats.dimensionality_reduction

from __future__ import print_function
from builtins import zip
from builtins import range
import pandas as pd

from mca import MCA
from sklearn.decomposition import PCA, TruncatedSVD


def _decompose(
    features, decompose_class, feature_names=None, k=20, component_prefix="component"
):
    """
    Internal helper that decomposes a set of features using a scikit-learn decomposition class and returns \
    the resulting matrices.

    :param features: A :py:class:`pandas.DataFrame` or sparse matrix where rows are documents and columns are features
    :param decompose_class: A scikit-learn decomposition class to apply (e.g. ``PCA`` or ``TruncatedSVD``)
    :param feature_names: An optional list of feature names (for sparse matrices)
    :type feature_names: list
    :param k: Number of dimensions to extract
    :type k: int
    :param component_prefix: A prefix for the column names
    :type component_prefix: str
    :return: A tuple of two :py:class:`pandas.DataFrame` s, (features x components, documents x components)
    :rtype: tuple
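
    A minimal usage sketch (illustrative only; ``df`` stands in for any documents-by-features \
    :py:class:`pandas.DataFrame`, and in practice this helper is reached through :py:func:`get_pca` \
    or :py:func:`get_lsa` below)::

        from sklearn.decomposition import PCA

        components, results = _decompose(df, PCA, k=2, component_prefix="pca")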
    """

    model = decompose_class(n_components=k)
    # Densify sparse matrices; DataFrames have no .todense() and pass through unchanged
    try:
        features = pd.DataFrame(features.todense())
    except AttributeError:
        pass
    model.fit(features)
    score = sum(model.explained_variance_ratio_)
    print("Decomposition explained variance ratio: {}".format(score))
    if not feature_names:
        feature_names = features.columns
    components = pd.DataFrame(model.components_, columns=feature_names).transpose()
    print("Top features:")
    for col in components.columns:
        print(
            "Component {}: {}".format(
                col, components.sort_values(col, ascending=False)[:10].index.values
            )
        )
    components.columns = [
        "{}_{}".format(component_prefix, c) for c in components.columns
    ]
    results = pd.DataFrame(model.transform(features), index=features.index)
    results.columns = components.columns
    results[component_prefix] = results.idxmax(axis=1)
    return (components, results)


def get_pca(features, feature_names=None, k=20):

    """
    Performs PCA on a set of features. This function expects input data where the rows are units \
    and columns are features. For more information about how PCA is implemented, visit the \
    `Scikit-Learn Documentation <https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html>`__.

    :param features: A :py:class:`pandas.DataFrame` or sparse matrix where rows are units/observations and columns \
    are features
    :param feature_names: An optional list of feature names (for sparse matrices)
    :type feature_names: list
    :param k: Number of dimensions to extract
    :type k: int
    :return: A tuple of two :py:class:`pandas.DataFrame` s, (features x components, units x components)
    :rtype: tuple

    Usage::

        from pewanalytics.stats.dimensionality_reduction import get_pca
        from sklearn import datasets
        import pandas as pd

        df = pd.DataFrame(datasets.load_iris().data)

        >>> feature_weights, df_reduced = get_pca(df, k=2)
        Decomposition explained variance ratio: 0.977685206318795
        Top features:
        Component 0: [2 0 3 1]
        Component 1: [1 0 3 2]

        >>> feature_weights
              pca_0     pca_1
        0  0.361387  0.656589
        1 -0.084523  0.730161
        2  0.856671 -0.173373
        3  0.358289 -0.075481

        >>> df_reduced.head()
              pca_0     pca_1    pca
        0 -2.684126  0.319397  pca_1
        1 -2.714142 -0.177001  pca_1
        2 -2.888991 -0.144949  pca_1
        3 -2.745343 -0.318299  pca_1
        4 -2.728717  0.326755  pca_1
    """

    return _decompose(
        features, PCA, feature_names=feature_names, k=k, component_prefix="pca"
    )
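
# A possible way to pick ``k`` for get_pca (an illustrative sketch, not part of
# pewanalytics): fit an unrestricted PCA first, then keep just enough components
# to cover ~95% of the variance. Only numpy and sklearn.decomposition.PCA are
# assumed; ``df`` is any units-by-features DataFrame.
#
#     import numpy as np
#     from sklearn.decomposition import PCA
#
#     cumulative = np.cumsum(PCA().fit(df).explained_variance_ratio_)
#     k = int(np.searchsorted(cumulative, 0.95)) + 1  # smallest k reaching 95%
#     feature_weights, df_reduced = get_pca(df, k=k)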

def get_lsa(features, feature_names=None, k=20):

    """
    Performs LSA on a set of features. This function expects input data where the rows are units \
    and columns are features. For more information about how LSA is implemented, visit the \
    `Scikit-Learn Documentation \
    <https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.TruncatedSVD.html>`__.

    :param features: A :py:class:`pandas.DataFrame` or sparse matrix where rows are units/observations and columns \
    are features
    :param feature_names: An optional list of feature names (for sparse matrices)
    :type feature_names: list
    :param k: Number of dimensions to extract
    :type k: int
    :return: A tuple of two :py:class:`pandas.DataFrame` s, (features x components, units x components)
    :rtype: tuple

    Usage::

        from pewanalytics.stats.dimensionality_reduction import get_lsa
        from sklearn import datasets
        import pandas as pd

        df = pd.DataFrame(datasets.load_iris().data)

        >>> feature_weights, df_reduced = get_lsa(df, k=2)
        Decomposition explained variance ratio: 0.9772093692426493
        Top features:
        Component 0: [0 2 1 3]
        Component 1: [1 0 3 2]

        >>> feature_weights
              lsa_0     lsa_1
        0  0.751108  0.284175
        1  0.380086  0.546745
        2  0.513009 -0.708665
        3  0.167908 -0.343671

        >>> df_reduced.head()
              lsa_0     lsa_1    lsa
        0  5.912747  2.302033  lsa_0
        1  5.572482  1.971826  lsa_0
        2  5.446977  2.095206  lsa_0
        3  5.436459  1.870382  lsa_0
        4  5.875645  2.328290  lsa_0
    """

    return _decompose(
        features, TruncatedSVD, feature_names=feature_names, k=k, component_prefix="lsa"
    )
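
# ``feature_names`` matters for sparse input, which carries no column labels of
# its own. A minimal sketch (illustrative; reuses the TfidfVectorizer setup from
# the correspondence_analysis example below; get_feature_names_out() assumes
# scikit-learn >= 1.0, and is wrapped in list() because _decompose truth-tests
# the argument, which would fail on a numpy array):
#
#     vec = TfidfVectorizer(min_df=10, max_df=.9).fit(df["text"])
#     tfidf = vec.transform(df["text"])  # scipy sparse matrix
#     feature_weights, df_reduced = get_lsa(
#         tfidf, feature_names=list(vec.get_feature_names_out()), k=2
#     )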

def correspondence_analysis(edges, n=1):

    """
    Performs correspondence analysis on a set of features. Most useful in the context of network analysis, \
    where you might wish to, for example, identify the underlying dimension in a network of Twitter users \
    by using a matrix representing whether or not they follow one another (when news and political accounts \
    are included, the underlying dimension often appears to approximate the left-right political spectrum).

    :param edges: An NxN :py:class:`pandas.DataFrame` where both the rows and columns are "nodes" and the values \
    are some sort of closeness or similarity measure (like a cosine similarity matrix)
    :param n: The number of dimensions to extract
    :type n: int
    :return: A :py:class:`pandas.DataFrame` where rows are the units and the columns correspond to the extracted \
    dimensions
    :rtype: pandas.DataFrame

    Usage::

        from pewanalytics.stats.dimensionality_reduction import correspondence_analysis
        import nltk
        import pandas as pd
        from sklearn.metrics.pairwise import linear_kernel
        from sklearn.feature_extraction.text import TfidfVectorizer

        nltk.download("inaugural")
        df = pd.DataFrame([
            {"speech": fileid, "text": nltk.corpus.inaugural.raw(fileid)}
            for fileid in nltk.corpus.inaugural.fileids()
        ])
        vec = TfidfVectorizer(min_df=10, max_df=.9).fit(df['text'])
        tfidf = vec.transform(df['text'])
        cosine_similarities = linear_kernel(tfidf)
        matrix = pd.DataFrame(cosine_similarities, columns=df['speech'])

        # Looks like the main source of variation in the language of inaugural speeches is time!
        >>> mca = correspondence_analysis(matrix)

        >>> mca.sort_values("mca_1").head()
                        node     mca_1
        57  1993-Clinton.txt -0.075508
        56    2017-Trump.txt -0.068168
        55  1997-Clinton.txt -0.061567
        54    1973-Nixon.txt -0.060698
        53     1989-Bush.txt -0.056305

        >>> mca.sort_values("mca_1").tail()
                        node     mca_1
        4     1877-Hayes.txt  0.040037
        3    1817-Monroe.txt  0.040540
        2      1845-Polk.txt  0.042847
        1    1849-Taylor.txt  0.050937
        0   1829-Jackson.txt  0.056201
    """

    mca_counts = MCA(edges)
    rows = []
    # Pair each node with its row factor scores, sorted by the first dimension
    for r in sorted(
        zip(edges.columns, mca_counts.fs_r(N=n)),
        key=lambda x: x[1][0],
        reverse=True,
    ):
        row = {"node": r[0]}
        for i in range(n):
            try:
                row["mca_{}".format(i + 1)] = r[1][i]
            except IndexError:
                # fs_r may return fewer than n dimensions
                pass
        rows.append(row)
    mca = pd.DataFrame(rows)
    return mca
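
# Extracting more than one dimension is just a matter of raising ``n``
# (an illustrative sketch; ``matrix`` is the cosine-similarity DataFrame
# built in the correspondence_analysis usage example above):
#
#     mca2 = correspondence_analysis(matrix, n=2)  # adds an "mca_2" column
#     mca2.sort_values("mca_2").head()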