Tutorial Diversity Measures#
This tutorial demonstrates how to quantify the diversity of a selected subset using the `diversity` module of the `selector` package. The diversity measures are calculated from the feature matrix of the selected subset.
import sys
import warnings
warnings.filterwarnings("ignore")
# uncomment the following line to run the code for your own project directory
# sys.path.append("/Users/Someone/Documents/projects/Selector")
import matplotlib.pylab as plt
import numpy as np
from sklearn.datasets import make_blobs
from sklearn.metrics.pairwise import pairwise_distances
from IPython.display import Markdown
from selector.methods.distance import MaxMin, MaxSum, OptiSim, DISE
from selector.diversity import compute_diversity, hypersphere_overlap_of_subset
Utility Function for Showing Diversity Measures as A Table#
# define function to render tables easier
def render_table(data, caption=None, decimals=3):
    """Render a list of lists as a Markdown table for easy visualization.

    Parameters
    ----------
    data : list of lists
        The data to be rendered in a table, each inner list represents a row with the first row
        being the header.
    caption : str, optional
        The caption of the table, rendered in bold above it. If None, no caption is added.
    decimals : int, optional
        The number of decimal places to round float entries of the data rows to.

    Returns
    -------
    object
        Whatever ``IPython.display.display`` returns for the rendered Markdown object.

    Raises
    ------
    ValueError
        If `data` is empty, if the rows do not all have the same number of columns, or if
        `caption` is given but is not a string.
    """
    # imported locally so the function does not depend on `display` having been injected into
    # the global namespace by a notebook front-end
    from IPython.display import Markdown, display

    if len(data) == 0:
        raise ValueError("Expect data to contain at least a header row.")
    # check all rows have the same number of columns
    n_cols = len(data[0])
    if any(len(row) != n_cols for row in data):
        raise ValueError("Expect all rows to have the same number of columns.")
    if caption is not None and not isinstance(caption, str):
        raise ValueError("Expect caption to be a string.")

    # convert every cell to its displayed text first (rounding floats in the data rows), so the
    # column widths below are measured on what is actually shown
    header = [str(s) for s in data[0]]
    body = [
        [str(round(s, decimals)) if isinstance(s, float) else str(s) for s in row]
        for row in data[1:]
    ]

    # width of each column: longest cell in the column plus some padding
    colwidths = [max(len(cell) for cell in col) + 2 for col in zip(header, *body)]

    def format_row(cells):
        # center each cell within its column width
        return f"| {' | '.join(f'{c:^{w}}' for c, w in zip(cells, colwidths))} |"

    lines = []
    # the caption is optional; the table itself is always emitted
    if caption is not None:
        lines.append(f"**{caption}**\n")
    lines.append(format_row(header))
    # separator row; `w + 2` accounts for the spaces surrounding each cell in format_row
    lines.append(f"|{'|'.join('-' * (w + 2) for w in colwidths)}|")
    lines.extend(format_row(row) for row in body)

    return display(Markdown("\n".join(lines) + "\n"))
Generating Data#
The data should be provided as:
- either an array `X` of shape `(n_samples, n_features)` encoding `n_samples` samples (rows), each in an `n_features`-dimensional (columns) feature space,
- or an array `X_dist` of shape `(n_samples, n_samples)` encoding the distance (i.e., dissimilarity) between each pair of the `n_samples` sample points.
This data can be loaded from various file formats (e.g., csv, npz, txt, etc.) or generated on the fly using various libraries. In this tutorial, we use `sklearn.datasets.make_blobs` to generate clusters of `n_samples` points in a 20-dimensional feature space (`n_features=20`), and then binarize each feature by thresholding it at its median. The same functionality can be applied to datasets of any dimensionality.
# generate 500 samples in a 20-dimensional feature space; no `centers` argument is passed,
# so make_blobs decides the number of clusters (three labels, 0-2, are reported below)
X, labels = make_blobs(n_samples=500, n_features=20, random_state=42)

# binarize the features: an entry becomes 1 if it exceeds the median of its feature
# (column) and 0 otherwise
median_threshold = np.median(X, axis=0)
X = np.greater(X, median_threshold).astype(int)

# compute the (n_samples, n_samples) pairwise Euclidean distance matrix
X_dist = pairwise_distances(X, metric="euclidean")

# report the shapes of the generated arrays and the size of the cluster labelled 0
for description, value in (
    ("Shape of data = ", X.shape),
    ("Shape of labels = ", labels.shape),
    ("Unique labels = ", np.unique(labels)),
    ("Cluster size = ", np.count_nonzero(labels == 0)),
    ("Shape of the distance array = ", X_dist.shape),
):
    print(description, value)
Shape of data = (500, 20)
Shape of labels = (500,)
Unique labels = [0 1 2]
Cluster size = 167
Shape of the distance array = (500, 500)
Perform the Subset Selection#
from scipy.spatial.distance import pdist, squareform
# select data using distance based methods
# ----------------------------------------
size = 50

# MaxMin is given the precomputed pairwise distance matrix
collector = MaxMin()
index_maxmin = collector.select(X_dist, size=size)

# MaxSum is given the feature matrix together with a function that turns it into a square
# Minkowski (p=0.1) distance matrix
collector = MaxSum(fun_dist=lambda x: squareform(pdist(x, metric="minkowski", p=0.1)))
index_maxsum = collector.select(X, size=size)

# OptiSim is given the precomputed pairwise distance matrix
collector = OptiSim(ref_index=0, tol=0.1)
index_optisim = collector.select(X_dist, size=size)

# DISE is given the feature matrix
collector = DISE(ref_index=0, p=2.0)
index_dise = collector.select(X, size=size)

# diversity measures to report, and the selected indices keyed by the method that produced them
div_measure = ["logdet", "wdud", "shannon_entropy", "hypersphere_overlap"]
selected_sets = {
    "MaxMin": index_maxmin,
    "MaxSum": index_maxsum,
    "OptiSim": index_optisim,
    "DISE": index_dise,
}

# compute the diversity of the selected sets and render the results in a table;
# the first row is the header, followed by one row per selection method
table_data = [[""] + div_measure]
for method_name, selected_index in selected_sets.items():
    # hypersphere overlap is computed by its own function, which takes the full data set as
    # well as the subset; every other measure goes through compute_diversity
    scores = [
        hypersphere_overlap_of_subset(x=X, x_subset=X[selected_index])
        if measure == "hypersphere_overlap"
        else compute_diversity(X[selected_index], div_type=measure)
        for measure in div_measure
    ]
    table_data.append([method_name, *scores])

render_table(table_data, caption="Diversity of Selected Sets")
Diversity of Selected Sets
|         | logdet | wdud  | shannon_entropy | hypersphere_overlap |
|---------|--------|-------|-----------------|---------------------|
| MaxMin  | 44.143 | 0.273 | 18.637          | 1299.615            |
| MaxSum  | 33.938 | 0.261 | 19.379          | 4396.672            |
| OptiSim | 43.734 | 0.254 | 19.758          | 1175.49             |
| DISE    | 45.402 | 0.268 | 18.958          | 1363.434            |