# Source code for pewanalytics.stats.sampling

from __future__ import print_function
from __future__ import division
from builtins import zip
from builtins import range
from builtins import object
import random
import numpy as np


def compute_sample_weights_from_frame(frame, sample, weight_vars):

    """
    Takes two :py:class:`pandas.DataFrame` s and computes sampling weights for the second one, based on the first.
    The first :py:class:`pandas.DataFrame` should be equivalent to the population that the second
    :py:class:`pandas.DataFrame`, a sample, was drawn from. Weights are calculated from the differences in
    the distribution of one or more variables specified in ``weight_vars`` (these should be the names of columns):
    for every combination of values, the weight is the combination's share of the ``frame`` divided by its share
    of the ``sample``.

    Neither ``frame`` nor ``sample`` is modified. Rows of the ``sample`` whose combination of values does not occur
    in the ``frame`` receive a missing (NaN) weight. If ``weight_vars`` is empty, every row gets a weight of 1.0.

    :param frame: :py:class:`pandas.DataFrame` (must contain all of the columns specified in ``weight_vars``)
    :param sample: :py:class:`pandas.DataFrame` (must contain all of the columns specified in ``weight_vars``)
    :param weight_vars: The names of the columns to use when computing weights (a single column name is also \
        accepted).
    :type weight_vars: list
    :return: A :py:class:`pandas.Series` named ``weight``, indexed like the ``sample``, containing the weight \
        for each row in the ``sample``

    Usage::

        from pewanalytics.stats.sampling import compute_sample_weights_from_frame
        import nltk
        import pandas as pd

        nltk.download("inaugural")
        frame = pd.DataFrame([
            {"speech": fileid, "text": nltk.corpus.inaugural.raw(fileid)} for fileid in nltk.corpus.inaugural.fileids()
        ])
        # Let's grab a sample of speeches - some that mention specific terms, and an additional random sample
        frame['economy'] = frame['text'].str.contains("economy").astype(int)
        frame['health'] = frame['text'].str.contains("health").astype(int)
        frame['immigration'] = frame['text'].str.contains("immigration").astype(int)
        frame['education'] = frame['text'].str.contains("education").astype(int)
        sample = pd.concat([
            frame[frame['economy']==1].sample(5),
            frame[frame['health']==1].sample(5),
            frame[frame['immigration']==1].sample(5),
            frame[frame['education']==1].sample(5),
            frame.sample(5)
        ])
        # Now we can get the sampling weights to adjust it back to the population based on those variables

        >>> sample['weight'] = compute_sample_weights_from_frame(frame, sample, ["economy", "health", "immigration", "education"])
        >>> sample
                       speech                                               text  economy  health  immigration  education    weight
        7     1817-Monroe.txt  I should be destitute of feeling if I was not ...        1       1            0          0  1.005747
        11   1833-Jackson.txt  Fellow citizens, the will of the American peop...        1       0            0          0  2.370690
        34  1925-Coolidge.txt  My countrymen, no one can contemplate curre...           1       0            1          1  0.344828
        35    1929-Hoover.txt  My Countrymen: This occasion is not alone the ...        1       1            0          1  0.538793
        28  1901-McKinley.txt  My fellow-citizens, when we assembled here on ...        1       0            0          0  2.370690

    """

    # Accept a bare column name as shorthand for a one-element list
    if isinstance(weight_vars, str):
        weight_vars = [weight_vars]
    else:
        weight_vars = list(weight_vars)

    if len(weight_vars) == 0:
        # ``assign`` builds a new DataFrame, so the caller's sample is left untouched
        return sample.assign(weight=1.0)["weight"]

    # Share of rows falling into each combination of values, in the population and in the sample
    frame_shares = frame.groupby(weight_vars).size() / len(frame)
    sample_shares = sample.groupby(weight_vars).size() / len(sample)

    # Division aligns on the combinations; combinations missing from either side come out as NaN
    weights = (frame_shares / sample_shares).rename("weight").reset_index()

    # Merge on the key columns only, so pre-existing "weight"/"count" columns in the sample cannot collide
    merged = sample[weight_vars].merge(weights, how="left", on=weight_vars)
    # A left merge keeps the row order of the sample but resets the index; restore it
    merged.index = sample.index

    return merged["weight"]


def compute_balanced_sample_weights(sample, weight_vars, weight_column=None):

    """
    Takes a :py:class:`pandas.DataFrame` and one or more column names (``weight_vars``) and computes weights such
    that every unique combination of values in the weighting columns is balanced (when weighted, the sum of the
    observations with each combination will be equal to one another). Useful for balancing important groups in
    training datasets, etc.

    Each combination's share of the sample is computed (its share of rows, or its share of the total of
    ``weight_column`` when one is given); the weight of a row is ``(1 / number of combinations with a positive
    share) / share of the row's combination``. Rows belonging to a combination without a positive share (e.g. all
    of its existing weights are zero) receive a weight of 0.0.

    :param sample: :py:class:`pandas.DataFrame` (must contain all of the columns specified in ``weight_vars``)
    :param weight_vars: The names of the columns to use when computing weights (a single column name is also \
        accepted).
    :type weight_vars: list
    :param weight_column: An optional column containing existing weights, which can be factored into the new weights.
    :type weight_column: str
    :return: A :py:class:`pandas.Series` containing the weights for each row in the ``sample``

    .. note:: The computed weights are also written to a ``weight`` column on the ``sample`` that was passed in \
        (an existing ``weight`` column is overwritten).

    .. note:: Rows are grouped by the raw values of the weighting columns, so binary flags (1 or 0) and other \
        discrete values (e.g. category labels) are both supported.

    Usage::

        from pewanalytics.stats.sampling import compute_balanced_sample_weights
        import pandas as pd

        # Let's say we have a set of tweets from members of Congress
        df = pd.DataFrame([
            {"politician_id": 1, "party": "R", "tweet": "Example document"},
            {"politician_id": 1, "party": "R", "tweet": "Example document"},
            {"politician_id": 2, "party": "D", "tweet": "Example document"},
            {"politician_id": 2, "party": "D", "tweet": "Example document"},
            {"politician_id": 3, "party": "D", "tweet": "Example document"},
        ])
        df['is_republican'] = (df['party']=="R").astype(int)

        # We can balance the parties like so:

        >>> df['weight'] = compute_balanced_sample_weights(df, ["is_republican"])

        >>> df
           politician_id party             tweet  is_republican    weight
        0              1     R  Example document              1  1.250000
        1              1     R  Example document              1  1.250000
        2              2     D  Example document              0  0.833333
        3              2     D  Example document              0  0.833333
        4              3     D  Example document              0  0.833333

    """

    # Local import: this module does not import pandas at module level
    import pandas as pd

    # Accept a bare column name as shorthand for a one-element list
    if isinstance(weight_vars, str):
        weight_vars = [weight_vars]
    # Repeated column names would not change the grouping; drop them
    weight_vars = list(dict.fromkeys(weight_vars))

    if len(weight_vars) == 0 or len(sample) == 0:
        # Nothing to balance on (or nothing to weight)
        sample["weight"] = 1.0
        return sample["weight"]

    # "Mass" of each row: its existing weight if one was supplied, otherwise every row counts once
    if weight_column:
        mass = sample[weight_column].astype(float)
    else:
        mass = pd.Series(1.0, index=sample.index)
    total_mass = float(mass.sum())

    # Group positionally on the raw column values (plain arrays avoid any index alignment)
    by_combo = mass.groupby([sample[col].values for col in weight_vars])

    # Share of the total mass held by each combination, and by the combination of each row
    combo_share = by_combo.sum() / total_mass
    row_share = by_combo.transform("sum") / total_mass

    # Only combinations that actually carry weight take part in the balancing
    num_valid_combos = int((combo_share > 0).sum())

    if num_valid_combos > 0:
        balanced_ratio = 1.0 / float(num_valid_combos)
        # Rows in combinations without a positive share get 0.0 instead of an infinite/undefined weight
        weights = (balanced_ratio / row_share).where(row_share > 0, 0.0)
    else:
        weights = pd.Series(0.0, index=sample.index)

    # Assign positionally so that a non-unique index on the sample cannot disturb the assignment
    sample["weight"] = weights.values

    return sample["weight"]


class SampleExtractor(object):
    """
    A helper class for extracting samples using various sampling methods.

    :param df: The sampling frame
    :type df: :py:class:`pandas.DataFrame`
    :param id_col: Column in the :py:class:`pandas.DataFrame` to be used as the unique ID of observations
    :type id_col: str
    :param verbose: Whether or not to print information during the sampling process (default=False)
    :type verbose: bool
    :param seed: Random seed (optional); when omitted, a seed is drawn at random
    :type seed: int

    """

    # Placed between column values when several stratification columns are combined into one key, so that
    # value pairs such as ("1", "23") and ("12", "3") do not collapse into the same stratum.
    _STRATA_SEPARATOR = "\x1f"

    def __init__(self, df, id_col, verbose=False, seed=None):

        self.df = df
        self.id_col = id_col
        self.seed = seed
        self.verbose = verbose

        # Only draw a seed when none was supplied; 0 is a legitimate seed and must be kept
        if self.seed is None:
            self.seed = int(round(1000 * np.random.random()))

    def extract(self, sample_size, sampling_strategy="random", stratify_by=None):

        """
        Extract a sample from a :py:class:`pandas.DataFrame` using one of the following methods:

        - all: Returns all of the IDs
        - random: Returns a random sample
        - stratify: Proportional stratification, method from Kish, Leslie. "Survey sampling." (1965). Chapter 4.
        - stratify_even: Sample evenly from each strata (will obviously not be representative)
        - stratify_guaranteed: Proportional stratification, but the sample is guaranteed to contain at least one
          observation from each strata (if sample size is small and/or there are many small strata, the resulting
          sample may be far from representative)

        For the stratified strategies a temporary ``_stratify_by`` column is added to the sampling frame while the
        sample is drawn; it is removed again before this method returns, even if sampling fails.

        :param sample_size: The desired size of the sample. Values below 1 are interpreted as a proportion of \
            the sampling frame.
        :type sample_size: int
        :param sampling_strategy: The method to be used to extract samples. Options are: all, random, stratify, \
            stratify_even, stratify_guaranteed
        :type sampling_strategy: str
        :param stratify_by: Name of a column or list of columns in the :py:class:`pandas.DataFrame` to \
            stratify on (required for the stratified strategies)
        :type stratify_by: str, list
        :return: A list of IDs reflecting the observations selected from the :py:class:`pandas.DataFrame` during \
            sampling
        :rtype: list
        :raises ValueError: If ``sampling_strategy`` is not one of the supported options, or if a stratified \
            strategy is requested without ``stratify_by``

        Usage (the counts shown are illustrative; the exact draw depends on the seed)::

            from pewanalytics.stats.sampling import SampleExtractor
            import nltk
            import pandas as pd

            nltk.download("inaugural")
            frame = pd.DataFrame([
                {"speech": fileid, "text": nltk.corpus.inaugural.raw(fileid)} for fileid in nltk.corpus.inaugural.fileids()
            ])
            frame["century"] = frame['speech'].map(lambda x: "{}00".format(x.split("-")[0][:2]))

            >>> sampler = SampleExtractor(frame, "speech", seed=42)

            >>> sample_index = sampler.extract(12, sampling_strategy="random")
            >>> frame[frame["speech"].isin(sample_index)]['century'].value_counts()
            1900    6
            1800    5
            1700    1
            Name: century, dtype: int64

            >>> sample_index = sampler.extract(12, sampling_strategy="stratify", stratify_by=['century'])
            >>> frame[frame["speech"].isin(sample_index)]['century'].value_counts()
            1800    5
            1900    5
            2000    1
            1700    1
            Name: century, dtype: int64

            >>> sample_index = sampler.extract(12, sampling_strategy="stratify_even", stratify_by=['century'])
            >>> frame[frame["speech"].isin(sample_index)]['century'].value_counts()
            1800    3
            2000    3
            1700    3
            1900    3
            Name: century, dtype: int64

            >>> sample_index = sampler.extract(12, sampling_strategy="stratify_guaranteed", stratify_by=['century'])
            >>> frame[frame["speech"].isin(sample_index)]['century'].value_counts()
            1900    5
            1800    4
            1700    2
            2000    1
            Name: century, dtype: int64

        """

        strategies = [
            "all",
            "random",
            "stratify",
            "stratify_even",
            "stratify_guaranteed",
        ]
        if sampling_strategy not in strategies:
            # ValueError is a subclass of Exception, so existing ``except Exception`` handlers still work
            raise ValueError(
                "You must choose one of the following sampling strategies: {}".format(
                    strategies
                )
            )

        doc_ids = None

        if sampling_strategy == "all":
            doc_ids = self.df[self.id_col].values

        elif sampling_strategy == "random":
            doc_ids = self._random_sample(sample_size).values

        else:
            # One of the stratified strategies
            if stratify_by is None:
                raise ValueError(
                    "You must specify 'stratify_by' when using the '{}' strategy".format(
                        sampling_strategy
                    )
                )
            # A single column name is allowed as shorthand for a one-element list
            if isinstance(stratify_by, str):
                stratify_by = [stratify_by]
            else:
                stratify_by = list(stratify_by)
            if len(stratify_by) == 0:
                raise ValueError("'stratify_by' must contain at least one column")

            if self.verbose:
                print(
                    "Stratify on columns: {}".format(
                        ",".join(str(col) for col in stratify_by)
                    )
                )

            # So you can pass in a decimal proportion of total dataframe or number of samples
            sample_n = (
                int(sample_size)
                if sample_size >= 1
                else int(round(sample_size * self.df.shape[0]))
            )

            samplers = {
                "stratify": self._stratify_sample,
                "stratify_even": self._stratify_even_sample,
                "stratify_guaranteed": self._stratify_guaranteed_sample,
            }

            # Collapse the stratification columns into a single key per row
            self.df["_stratify_by"] = (
                self.df[stratify_by]
                .astype(str)
                .apply(self._STRATA_SEPARATOR.join, axis=1)
            )
            try:
                doc_ids = samplers[sampling_strategy](sample_n)
            finally:
                # Never leave the helper column behind on the caller's frame
                del self.df["_stratify_by"]

        if self.verbose:
            print("Sample of %i extracted" % (len(doc_ids)))

        return list(doc_ids)

    def _random_sample(self, sample_size):
        """
        Draw a simple random sample (without replacement) using ``self.seed``.

        :param sample_size: Number of rows to draw, or a proportion of the frame when below 1
        :return: A :py:class:`pandas.Series` with the ``id_col`` values of the selected rows
        """

        if self.verbose:
            print("Basic random sample")
        if sample_size >= 1:
            return self.df.sample(int(sample_size), random_state=self.seed)[self.id_col]
        else:
            return self.df.sample(frac=float(sample_size), random_state=self.seed)[
                self.id_col
            ]

    def _stratify_sample(self, sample_size):
        """
        Draw a proportionally stratified systematic sample: the frame is shuffled, ordered by stratum, and then
        every k-th row is selected starting from a random offset (k = frame size / sample size).

        Expects ``self.df`` to contain the ``_stratify_by`` column. Seeds NumPy's global random state with
        ``self.seed``.

        :param sample_size: Number of rows to draw
        :return: An array with the ``id_col`` values of the selected rows
        """

        if self.verbose:
            print("Kish-style stratification")

        # Subset & copy cols that we care about
        data = self.df[[self.id_col, "_stratify_by"]].copy()
        frame_size = data.shape[0]
        sample_size = int(sample_size)
        if sample_size <= 0 or frame_size == 0:
            # Nothing to draw (and the skip interval below would be undefined)
            return data[self.id_col].values[:0]

        # Shuffle the dataframe
        if self.verbose:
            print("Random seed: {}".format(self.seed))
        np.random.seed(self.seed)
        if self.verbose:
            print("Dataframe before sorting: {}".format(data.head()))
        data.index = np.random.permutation(data.index)

        # Order rows by stratum, and by the shuffled index within each stratum. Two stable sorts are used
        # rather than groupby().apply(), whose handling of the result differs between pandas versions.
        data = data.sort_index(kind="mergesort").sort_values(
            "_stratify_by", kind="mergesort"
        )
        data.index = list(range(0, frame_size))
        if self.verbose:
            print("Dataframe after shuffle & groupby sorting: {}".format(data.head()))

        skip_interval = float(frame_size) / float(sample_size)

        start_index = np.random.uniform(0, skip_interval)  # index to start from
        if self.verbose:
            print("Start index: {}".format(start_index))

        # Floor (not round) the positions: rounding could yield ``frame_size`` itself, which is past the last
        # row and silently shrinks the sample by one.
        sample_index = np.floor(
            start_index + (np.arange(sample_size) * skip_interval)
        ).astype(int)
        # Guard against floating point error pushing the last position past the end
        sample_index = np.minimum(sample_index, frame_size - 1)

        # Return the real id column
        sample_ids = data[data.index.isin(sample_index)][self.id_col].values

        return sample_ids

    def _stratify_even_sample(self, sample_size):
        """
        Draw (roughly) the same number of rows from every stratum, topping up with a random draw from the
        not-yet-selected rows when the strata cannot supply the full sample size.

        Expects ``self.df`` to contain the ``_stratify_by`` column. All draws use ``self.seed``.

        :param sample_size: Number of rows to draw
        :return: A list with the ``id_col`` values of the selected rows
        """

        strata = self.df["_stratify_by"].unique()
        if len(strata) == 0:
            return []

        docs_per_strata = int(float(sample_size) / float(len(strata)))
        if self.verbose:
            print(
                "Drawing even samples of {} across all stratification groups".format(
                    docs_per_strata
                )
            )

        doc_ids = []
        for stratum in strata:
            strata_data = self.df[self.df["_stratify_by"] == stratum]
            # A stratum smaller than the quota contributes all of its rows
            quota = min(docs_per_strata, len(strata_data))
            doc_ids.extend(
                list(
                    strata_data.sample(quota, random_state=self.seed)[
                        self.id_col
                    ].values
                )
            )

        shortfall = sample_size - len(doc_ids)
        if shortfall > 0:
            # Top up from rows that were not selected yet, so that no ID is returned twice
            remaining = self.df[~self.df[self.id_col].isin(doc_ids)]
            doc_ids.extend(
                list(
                    remaining.sample(
                        min(shortfall, len(remaining)), random_state=self.seed
                    )[self.id_col].values
                )
            )

        return doc_ids

    def _stratify_guaranteed_sample(self, sample_size):
        """
        Draw one row from every stratum first, then fill the rest of the sample with a proportionally
        stratified sample of the remaining rows. If there are at least as many strata as rows requested, a random
        subset of the one-per-stratum picks is returned instead.

        Expects ``self.df`` to contain the ``_stratify_by`` column. All draws use ``self.seed``.

        :param sample_size: Number of rows to draw
        :return: A list or array with the ``id_col`` values of the selected rows
        """

        strata_groups = self.df["_stratify_by"].nunique()

        # One randomly chosen ID per stratum
        one_per_stratum = self.df.groupby("_stratify_by")[self.id_col].apply(
            lambda ids: ids.sample(1, random_state=self.seed)
        )

        # Number of groups to stratify by must be less than the total sample size
        if sample_size > strata_groups:
            if self.verbose:
                print(
                    "Sampling one document per strata first ({} strata total)".format(
                        strata_groups
                    )
                )
            strata_one = one_per_stratum.values
        else:
            if self.verbose:
                print(
                    "There are more strata groups ({}) than things to sample: {}".format(
                        strata_groups, sample_size
                    )
                )
            # Not every stratum can be represented; keep a seeded random subset of the picks
            strata_one = one_per_stratum.sample(
                sample_size, random_state=self.seed
            ).values

        left_to_sample = sample_size - len(strata_one)
        if left_to_sample > 0:
            # Fill the remainder proportionally from the rows that were not picked above
            doc_ids = SampleExtractor(
                self.df[~self.df[self.id_col].isin(strata_one)],
                self.id_col,
                seed=self.seed,
                verbose=self.verbose,
            )._stratify_sample(left_to_sample)
            doc_ids = list(doc_ids) + list(strata_one)
        else:
            if self.verbose:
                print("Nothing left to sample, no stratification applied")
            doc_ids = strata_one

        return doc_ids