Tutorial Diversity Measures#

This tutorial demonstrates how to quantify the diversity of a selected subset with the diversity module implemented in the selector package. The diversity measures are calculated based on the feature matrix of the selected subset.

import sys
import warnings

warnings.filterwarnings("ignore")

# uncomment the following line to run the code for your own project directory
# sys.path.append("/Users/Someone/Documents/projects/Selector")
import matplotlib.pylab as plt
import numpy as np
from sklearn.datasets import make_blobs
from sklearn.metrics.pairwise import pairwise_distances
from IPython.display import Markdown

from selector.methods.distance import MaxMin, MaxSum, OptiSim, DISE
from selector.diversity import compute_diversity, hypersphere_overlap_of_subset

Utility Function for Showing Diversity Measures as a Table#

# define function to render tables easier


def render_table(data, caption=None, decimals=3):
    """Render a list of lists as a markdown table for easy visualization.

    Parameters
    ----------
    data : list of lists
        The data to be rendered in a table, each inner list represents a row with the first row
        being the header.
    caption : str, optional
        The caption of the table.
    decimals : int, optional
        The number of decimal places to round the data to.

    Raises
    ------
    ValueError
        If `data` is empty, if rows have different numbers of columns, or if
        `caption` is given but is not a string.
    """
    # guard against empty input before indexing data[0] below
    if not data:
        raise ValueError("Expect data to contain at least a header row.")

    # check all rows have the same number of columns
    if not all(len(row) == len(data[0]) for row in data):
        raise ValueError("Expect all rows to have the same number of columns.")

    # bug fix: tmp_output must be defined even when no caption is given,
    # otherwise the `tmp_output +=` below raises UnboundLocalError
    tmp_output = ""
    if caption is not None:
        # check if caption is a string
        if not isinstance(caption, str):
            raise ValueError("Expect caption to be a string.")
        tmp_output = f"**{caption}**\n\n"

    # get the width of each column (transpose the data list and get the max length of each new row)
    colwidths = [max(len(str(s)) for s in col) + 2 for col in zip(*data)]

    # construct the header row
    header = f"| {' | '.join(f'{str(s):^{w}}' for s, w in zip(data[0], colwidths))} |"
    tmp_output += header + "\n"

    # construct a separator row
    separator = f"|{'|'.join(['-' * w for w in colwidths])}|"
    tmp_output += separator + "\n"

    # construct the data rows
    for row in data[1:]:
        # round floats to the requested precision; leave other types untouched
        row = [round(s, decimals) if isinstance(s, float) else s for s in row]
        row_str = f"| {' | '.join(f'{str(s):^{w}}' for s, w in zip(row, colwidths))} |"
        tmp_output += row_str + "\n"

    return display(Markdown(tmp_output))

Generating Data#

The data should be provided as:

  • either an array X of shape (n_samples, n_features) encoding n_samples samples (rows) each in n_features-dimensional (columns) feature space,

  • or an array X_dist of shape (n_samples, n_samples) encoding the distance (i.e., dissimilarity) between each pair of n_samples sample points.

This data can be loaded from various file formats (e.g., csv, npz, txt, etc.) or generated using various libraries on the fly. In this tutorial, we use sklearn.datasets.make_blobs to generate clusters of n_samples points in a 20-dimensional feature space (n_features=20), which are then binarized against each feature's median. The same functionality can be applied to datasets of any dimensionality.

# Generate 500 sample points in a 20-dimensional feature space
# (make_blobs produces 3 blobs by default).
X, labels = make_blobs(
    n_samples=500,
    n_features=20,
    random_state=42,
)

# Binarize the features: each entry becomes 1 when it exceeds
# the median of its feature column, and 0 otherwise.
feature_medians = np.median(X, axis=0)
X = (X > feature_medians).astype(int)

# Compute the (n_samples, n_samples) pairwise Euclidean distance matrix.
X_dist = pairwise_distances(X, metric="euclidean")

print("Shape of data   = ", X.shape)
print("Shape of labels = ", labels.shape)
print("Unique labels   = ", np.unique(labels))
print("Cluster size    = ", np.count_nonzero(labels == 0))
print("Shape of the distance array = ", X_dist.shape)
Shape of data   =  (500, 20)
Shape of labels =  (500,)
Unique labels   =  [0 1 2]
Cluster size    =  167
Shape of the distance array =  (500, 500)

Perform the Subset Selection#

from scipy.spatial.distance import pdist, squareform

# Select data using distance-based methods
# ----------------------------------------
size = 50

# Run each selector and keep the indices of the chosen samples.
index_maxmin = MaxMin().select(X_dist, size=size)

index_maxsum = MaxSum(
    fun_dist=lambda x: squareform(pdist(x, metric="minkowski", p=0.1))
).select(X, size=size)

index_optisim = OptiSim(ref_index=0, tol=0.1).select(X_dist, size=size)

index_dise = DISE(ref_index=0, p=2.0).select(X, size=size)

div_measure = ["logdet", "wdud", "shannon_entropy", "hypersphere_overlap"]
selected_sets = zip(
    ["MaxMin", "MaxSum", "OptiSim", "DISE"],
    [index_maxmin, index_maxsum, index_optisim, index_dise],
)

# Compute every diversity measure for each selected subset and tabulate the results.
table_data = [[""] + div_measure]
for method_name, subset_idx in selected_sets:
    row = [method_name]
    for measure in div_measure:
        # hypersphere overlap needs the full dataset as well as the subset
        if measure == "hypersphere_overlap":
            row.append(hypersphere_overlap_of_subset(x=X, x_subset=X[subset_idx]))
        else:
            row.append(compute_diversity(X[subset_idx], div_type=measure))
    table_data.append(row)

render_table(table_data, caption="Diversity of Selected Sets")

Diversity of Selected Sets

| | logdet | wdud | shannon_entropy | hypersphere_overlap | |———|——————–|———————|——————–|———————| | MaxMin | 44.143 | 0.273 | 18.637 | 1299.615 | | MaxSum | 33.938 | 0.261 | 19.379 | 4396.672 | | OptiSim | 43.734 | 0.254 | 19.758 | 1175.49 | | DISE | 45.402 | 0.268 | 18.958 | 1363.434 |