import numpy as np
import os
import warnings
import random
import sys
import logging
import scipy.stats  # explicit submodule import; `import scipy` alone does not guarantee scipy.stats
from itertools import product, izip, chain, cycle
from collections import defaultdict
from functools import partial
from pymaptools.iter import izip_with_cycles, isiterable, take
from pymaptools.containers import labels_to_clusters, clusters_to_labels
from pymaptools.sample import discrete_sample, freqs2probas, randround
from pymaptools.io import GzipFileType, PathArgumentParser, write_json_line, read_json_lines, ndjson2col
from pymaptools.benchmark import PMTimer
from clustering_metrics.monte_carlo import utils
from clustering_metrics.fent import minmaxr
from clustering_metrics.utils import _div
from clustering_metrics.metrics import ClusteringMetrics, ConfusionMatrix2
from clustering_metrics.ranking import dist_auc
from clustering_metrics.skutils import auc
def parse_args(args=None):
parser = PathArgumentParser()
parser.add_argument(
'--logging', type=str, default='WARN', help="Logging level",
        choices=['DEBUG', 'INFO', 'WARN', 'WARNING', 'ERROR', 'CRITICAL'])  # avoid private logging._levelNames
subparsers = parser.add_subparsers()
p_mapper = subparsers.add_parser('mapper')
p_mapper.add_argument('--h0_err', type=float, default=1.0,
help='H0 error rate')
p_mapper.add_argument('--h1_err', type=float, default=0.5,
help='H1 error rate')
p_mapper.add_argument('--population_size', type=int, default=2000,
help='population size')
p_mapper.add_argument('--sim_size', type=int, default=1000,
help='Simulation size')
p_mapper.add_argument('--nclusters', type=int, default=20,
help='number of clusters to generate')
p_mapper.add_argument('--join_negatives', type=int, default=0,
help='whether to join negatives (if split_join<0)')
p_mapper.add_argument('--split_join', type=int, default=0,
help='number of splits (if positive) or joins (if negative) to perform')
p_mapper.add_argument('--sampling_warnings', type=int, default=0,
help='if true, show sampling warnings')
p_mapper.add_argument('--output', type=GzipFileType('w'),
default=sys.stdout, help='Output file')
p_mapper.add_argument('--metrics', type=str, required=True, nargs='*',
help='Which metrics to compute')
p_mapper.set_defaults(func=do_mapper)
p_reducer = subparsers.add_parser('reducer')
p_reducer.add_argument(
'--group_by', type=str, default=None,
help='Field to group by')
p_reducer.add_argument(
'--x_axis', type=str, default=None,
help='Which column to plot as X axis')
p_reducer.add_argument(
'--metrics', type=str, required=True, nargs='*',
help='Which metrics to compute')
p_reducer.add_argument(
'--input', type=GzipFileType('r'), default=sys.stdin, help='File input')
p_reducer.add_argument(
'--output', type=str, metavar='DIR', help='Output directory')
p_reducer.add_argument(
'--fig_title', type=str, default=None, help='Title (for figures generated)')
p_reducer.add_argument(
'--fig_format', type=str, default='svg', help='Figure format')
p_reducer.add_argument(
'--legend_loc', type=str, default='lower left',
help='legend location')
p_reducer.set_defaults(func=do_reducer)
namespace = parser.parse_args(args)
return namespace
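# Example invocations (illustrative: 'this_script.py' stands for this module's
# filename, and 'some_score' for any metric name that get_score accepts):
#
#   python this_script.py mapper --metrics some_score --sim_size 1000 \
#       --nclusters 20 --output mapped.json.gz
#   python this_script.py reducer --metrics some_score --group_by split_join \
#       --x_axis h1_err --input mapped.json.gz --output report_dir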
def do_mapper(args):
params = dict(
n=args.sim_size,
nclusters=args.nclusters,
split_join=args.split_join,
join_negatives=bool(args.join_negatives),
population_size=args.population_size,
with_warnings=args.sampling_warnings,
)
h0 = Grid.with_sim_clusters(p_err=args.h0_err, **params)
h1 = Grid.with_sim_clusters(p_err=args.h1_err, **params)
with PMTimer() as timer:
results = h0.compare(h1, args.metrics)
for result in results:
result.update(timer.to_dict())
result.update(utils.serialize_args(args))
write_json_line(args.output, result)
def auc_xscaled(xs, ys):
    """AUC score with the x values rescaled to fill the unit interval
    """
xmin, xmax = minmaxr(xs)
denom = float(xmax - xmin)
xs_corr = [(x - xmin) / denom for x in xs]
return auc(xs_corr, ys)
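# Worked example, assuming `auc` performs trapezoidal integration (as in
# sklearn.metrics.auc): xs = [0, 5, 10] rescale to [0.0, 0.5, 1.0], so the
# result is 0.5 * (0.5 + 0.75) / 2 + 0.5 * (0.75 + 1.0) / 2 == 0.75.
def _example_auc_xscaled():
    return auc_xscaled([0, 5, 10], [0.5, 0.75, 1.0])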
def create_plots(args, df):
import jinja2
import matplotlib.pyplot as plt
from palettable import colorbrewer
from matplotlib.font_manager import FontProperties
fontP = FontProperties()
fontP.set_size('xx-small')
groups = df.groupby([args.group_by])
metrics = list(set(args.metrics) & set(df.keys()))
colors = take(len(metrics), cycle(chain(
colorbrewer.qualitative.Dark2_8.mpl_colors,
colorbrewer.qualitative.Set2_8.mpl_colors,
)))
template_loader = jinja2.FileSystemLoader(os.path.join(args.output, '..'))
template_env = jinja2.Environment(loader=template_loader)
template_interactive = template_env.get_template('template_fig_interactive.html')
template_static = template_env.get_template('template_fig_static.html')
table_interactive = []
table_static = []
for group_name, group in groups:
        # always sort by X values
        group = group.sort_values(by=args.x_axis)
if args.fig_title is None:
fig_title = '%s=%s' % (args.group_by, group_name)
else:
fig_title = args.fig_title
# compute AUC scores
ys = []
for metric, color in zip(metrics, colors):
series = group[metric]
score = auc_xscaled(group[args.x_axis].values, series.values)
label = "%s (%.4f)" % (metric, score)
ys.append((score, metric, label, color))
        ys.sort(reverse=True)
        # unpack (sorted by descending AUC): metric names, legend labels, colors
        lbls_old, lbls_new, colors = zip(*ys)[1:4]
group = group[[args.x_axis] + list(lbls_old)] \
.set_index(args.x_axis) \
.rename(columns=dict(zip(lbls_old, lbls_new)))
# create plots
fig, ax = plt.subplots()
group.plot(ax=ax, title=fig_title, color=list(colors))
ax.set_xlim(*minmaxr(group.index.values))
ax.set_ylim(0.4, 1.0)
ax.legend(loc=args.legend_loc, prop=fontP)
fig_name = 'fig-%s.%s' % (group_name, args.fig_format)
fig_path = os.path.join(args.output, fig_name)
csv_name = 'fig-%s.csv' % group_name
csv_path = os.path.join(args.output, csv_name)
group.to_csv(csv_path)
table_interactive.append((
csv_name,
args.x_axis,
"%s=%s" % (args.group_by, group_name),
))
table_static.append(fig_name)
fig.savefig(fig_path, format=args.fig_format)
plt.close(fig)
with open(os.path.join(args.output, 'fig_interactive.html'), 'w') as fh:
fh.write(template_interactive.render(table=table_interactive))
with open(os.path.join(args.output, 'fig_static.html'), 'w') as fh:
fh.write(template_static.render(table=table_static))
def do_reducer(args):
import pandas as pd
obj = ndjson2col(read_json_lines(args.input))
df = pd.DataFrame.from_dict(obj)
csv_path = os.path.join(args.output, "summary.csv")
logging.info("Writing brief summary to %s", csv_path)
df.to_csv(csv_path)
create_plots(args, df)
def run(args):
logging.basicConfig(level=getattr(logging, args.logging))
args.func(args)
def get_conf(obj):
try:
return obj.pairwise
except AttributeError:
return obj
def sample_with_error(label, error_distribution, null_distribution):
    """Return a label, possibly corrupted by sampling error

    ``error_distribution`` must be of the form ``{False: 1.0 - p_err, True: p_err}``.
    With probability ``p_err``, a draw from ``null_distribution`` replaces the
    actual label.
    """
if discrete_sample(error_distribution):
# to generate error properly, draw from null distribution
return discrete_sample(null_distribution)
else:
        # no error: return the actual class label
return label
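# Illustrative draw (probabilities are made up): with a 5% error rate, the true
# label 3 is returned ~95% of the time; otherwise a label is drawn from the
# null distribution, mirroring how simulate_clustering builds both arguments.
def _example_sample_with_error():
    error_dist = {False: 0.95, True: 0.05}
    null_dist = freqs2probas([10, 5, 3, 2])  # class frequencies -> probabilities
    return sample_with_error(3, error_dist, null_dist)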
def relabel_negatives(clusters):
    """Give each negative (non-positive) label its own singleton class
    """
idx = -1
relabeled = []
for cluster in clusters:
relabeled_cluster = []
for class_label in cluster:
if class_label <= 0:
class_label = idx
relabeled_cluster.append(class_label)
idx -= 1
relabeled.append(relabeled_cluster)
return relabeled
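# Illustrative: non-positive labels receive unique negative IDs (assigned in
# encounter order) while positive labels pass through unchanged, e.g.
# [[0, 2], [0, 0]] -> [[-1, 2], [-2, -3]].
def _example_relabel_negatives():
    return relabel_negatives([[0, 2], [0, 0]])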
def join_clusters(clusters):
    """Roughly halve the number of clusters by joining adjacent pairs
    """
even = clusters[0::2]
odd = clusters[1::2]
if len(even) < len(odd):
even.append([])
elif len(even) > len(odd):
odd.append([])
assert len(even) == len(odd)
result = []
for c1, c2 in izip(even, odd):
result.append(c1 + c2)
return result
def split_clusters(clusters):
    """Roughly double the number of clusters by splitting each into even- and odd-indexed halves
    """
result = []
for cluster in clusters:
even = cluster[0::2]
odd = cluster[1::2]
assert len(even) + len(odd) == len(cluster)
if even:
result.append(even)
if odd:
result.append(odd)
return result
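# Illustrative round trip: a split interleaves each cluster into even- and
# odd-indexed halves; a join merges adjacent cluster pairs. Membership is
# preserved, though element order is not.
def _example_split_then_join():
    split = split_clusters([[1, 2, 3, 4]])   # [[1, 3], [2, 4]]
    return join_clusters(split)              # [[1, 3, 2, 4]]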
def simulate_clustering(galpha=2, gbeta=10, nclusters=20, pos_ratio=0.2,
p_err=0.05, population_size=2000, split_join=0,
join_negatives=False, with_warnings=True):
if not 0.0 <= p_err <= 1.0:
raise ValueError(p_err)
csizes = map(randround, np.random.gamma(galpha, gbeta, nclusters))
# make sure at least one cluster is generated
num_pos = sum(csizes)
if num_pos == 0:
csizes.append(1)
num_pos += 1
num_neg = max(0, population_size - num_pos)
if with_warnings:
if not 0.0 <= pos_ratio <= 1.0:
raise ValueError(pos_ratio)
expected_num_neg = num_pos * _div(1.0 - pos_ratio, pos_ratio)
actual_neg_ratio = _div(num_neg - expected_num_neg, expected_num_neg)
if abs(actual_neg_ratio) > 0.2:
warnings.warn(
"{:.1%} {} negatives than expected. Got: {} "
"(expected: {}. Recommended population_size: {})"
.format(abs(actual_neg_ratio), ("fewer" if actual_neg_ratio < 0.0 else "more"), num_neg,
int(expected_num_neg), int(expected_num_neg + num_pos)))
# set up probability distributions we will use
null_dist = freqs2probas([num_neg] + csizes)
error_dist = {False: 1.0 - p_err, True: p_err}
# negative case first
negatives = []
for _ in xrange(num_neg):
class_label = sample_with_error(0, error_dist, null_dist)
negatives.append([class_label])
# positive cases
positives = []
for idx, csize in enumerate(csizes, start=1):
if csize < 1:
continue
cluster = []
for _ in xrange(csize):
class_label = sample_with_error(idx, error_dist, null_dist)
cluster.append(class_label)
positives.append(cluster)
if split_join > 0:
for _ in xrange(split_join):
positives = split_clusters(positives)
elif split_join < 0:
for _ in xrange(-split_join):
positives = join_clusters(positives)
if join_negatives:
for _ in xrange(-split_join):
negatives = join_clusters(negatives)
return relabel_negatives(positives + negatives)
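# Minimal sketch (parameter values are illustrative, not tuned): one noisy
# clustering with ~20 gamma-sized positive clusters over a population of 2000
# and a 5% chance of mislabeling any given member.
def _example_simulate_clustering():
    return simulate_clustering(nclusters=20, p_err=0.05,
                               population_size=2000, split_join=0)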
def simulate_labeling(sample_size=2000, **kwargs):
clusters = simulate_clustering(**kwargs)
tuples = zip(*clusters_to_labels(clusters))
random.shuffle(tuples)
tuples = tuples[:sample_size]
ltrue, lpred = zip(*tuples) or ([], [])
return ltrue, lpred
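# Illustrative: draw one (ltrue, lpred) labeling and wrap it in a confusion
# matrix, the same path Grid.matrix_from_labels takes below.
def _example_simulate_labeling():
    ltrue, lpred = simulate_labeling(sample_size=200, p_err=0.1)
    return ClusteringMetrics.from_labels(ltrue, lpred)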
class Grid(object):
def __init__(self, seed=None):
if seed is not None:
np.random.seed(seed)
self.max_classes = None
self.max_counts = None
self.n = None
self.size = None
self.grid = None
self.grid_type = None
self.get_matrix = None
self.show_record = None
@classmethod
    def with_sim_clusters(cls, n=1000, size=200, seed=None, **kwargs):
obj = cls(seed=seed)
obj.grid = obj.fill_sim_clusters(size=size, n=n, **kwargs)
obj.grid_type = 'sim_clusters'
obj.get_matrix = obj.matrix_from_labels
obj.show_record = obj.show_cluster
return obj
@classmethod
    def with_clusters(cls, n=1000, size=200, max_classes=5, seed=None):
obj = cls(seed=seed)
obj.grid = obj.fill_clusters(max_classes=max_classes, size=size, n=n)
obj.grid_type = 'clusters'
obj.get_matrix = obj.matrix_from_labels
obj.show_record = obj.show_cluster
return obj
@classmethod
    def with_matrices(cls, n=1000, max_counts=100, seed=None):
obj = cls(seed=seed)
obj.grid = obj.fill_matrices(max_counts=max_counts, n=n)
obj.grid_type = 'matrices'
obj.get_matrix = obj.matrix_from_matrices
obj.show_record = obj.show_matrix
return obj
    def show_matrix(self, idx, inverse=False):
        # `inverse` is accepted for signature parity with show_cluster
        # but has no effect on matrix output
        return self.grid[0][idx]
    def show_cluster(self, idx, inverse=False):
grid = self.grid
a, b = (1, 0) if inverse else (0, 1)
return labels_to_clusters(grid[a][idx], grid[b][idx])
    def best_clustering_by_score(self, score, flip_sign=False):
idx, val = self.find_highest(score, flip_sign)
return {"idx": idx,
"found": "%s = %.4f" % (score, val),
"result": self.show_cluster(idx),
"inverse": self.show_cluster(idx, inverse=True)}
@staticmethod
    def matrix_from_labels(*args):
ltrue, lpred = args
return ClusteringMetrics.from_labels(ltrue, lpred)
@staticmethod
    def matrix_from_matrices(*args):
arr = args[0]
return ConfusionMatrix2.from_ccw(*arr)
    def iter_grid(self):
return enumerate(izip(*self.grid))
iter_clusters = iter_grid
    def iter_matrices(self):
if self.grid_type in ['matrices']:
for idx, tup in self.iter_grid():
yield idx, self.matrix_from_matrices(*tup)
elif self.grid_type in ['clusters', 'sim_clusters']:
for idx, labels in self.iter_grid():
yield idx, self.matrix_from_labels(*labels)
    def describe_matrices(self):
for idx, matrix in self.iter_matrices():
tup = tuple(get_conf(matrix).to_ccw())
max_idx = tup.index(max(tup))
if max_idx != 2:
print idx, tup
    def fill_clusters(self, n=None, size=None, max_classes=None):
if n is None:
n = self.n
else:
self.n = n
if size is None:
size = self.size
else:
self.size = size
if max_classes is None:
max_classes = self.max_classes
else:
self.max_classes = max_classes
classes = np.random.randint(
low=0, high=max_classes, size=(n, size))
clusters = np.random.randint(
low=0, high=max_classes, size=(n, size))
return classes, clusters
    def fill_sim_clusters(self, n=None, size=None, **kwargs):
if n is None:
n = self.n
else:
self.n = n
if size is None:
size = self.size
else:
self.size = size
classes = np.empty((n, size), dtype=np.int64)
clusters = np.empty((n, size), dtype=np.int64)
for idx in xrange(n):
ltrue, lpred = simulate_labeling(sample_size=size, **kwargs)
classes[idx, :] = ltrue
clusters[idx, :] = lpred
return classes, clusters
    def fill_matrices(self, max_counts=None, n=None):
if max_counts is None:
max_counts = self.max_counts
else:
self.max_counts = max_counts
if n is None:
n = self.n
else:
self.n = n
matrices = np.random.randint(
low=0, high=max_counts, size=(n, 4))
return (matrices,)
    def find_highest(self, score, flip_sign=False):
best_index = -1
if flip_sign:
direction = 1
curr_score = float('inf')
else:
direction = -1
curr_score = float('-inf')
for idx, conf in self.iter_matrices():
new_score = conf.get_score(score)
if cmp(curr_score, new_score) == direction:
best_index = idx
curr_score = new_score
return (best_index, curr_score)
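    # Illustrative use (the score name is a placeholder for any name accepted
    # by conf.get_score):
    #
    #   grid = Grid.with_matrices(n=1000, max_counts=100, seed=42)
    #   idx, val = grid.find_highest('some_score')
    #
    # returns the grid index where the score peaks (or bottoms out, when
    # flip_sign=True).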
    def find_matching_matrix(self, matches):
for idx, mx in self.iter_matrices():
mx = get_conf(mx)
if matches(mx):
return idx, mx
    def compute(self, scores, show_progress=False, dtype=np.float16):
result = defaultdict(partial(np.empty, (self.n,), dtype=dtype))
if not isiterable(scores):
scores = [scores]
        report_every = max(1, self.n // 20)  # report progress in ~5% increments
        for idx, conf in self.iter_matrices():
            # integer arithmetic here; the original float modulo test could
            # silently skip every progress report
            if show_progress and idx % report_every == 0:
                sys.stderr.write("%d%% done\n" % (100 * idx // self.n))
for score in scores:
score_arr = conf.get_score(score)
if isiterable(score_arr):
for j, val in enumerate(score_arr):
result["%s-%d" % (score, j)][idx] = val
else:
result[score][idx] = score_arr
return result
    def compare(self, others, scores, dtype=np.float16, plot=False):
result0 = self.compute(scores, dtype=dtype)
if not isiterable(others):
others = [others]
result_grid = []
for other in others:
result1 = other.compute(scores, dtype=dtype)
if plot:
from matplotlib import pyplot as plt
from palettable import colorbrewer
colors = colorbrewer.get_map('Set1', 'qualitative', 9).mpl_colors
result_row = {}
for score_name, scores0 in result0.iteritems():
scores1 = result1[score_name]
auc_score = dist_auc(scores0, scores1)
result_row[score_name] = auc_score
if plot:
scores0p = [x for x in scores0 if not np.isnan(x)]
scores1p = [x for x in scores1 if not np.isnan(x)]
hmin0, hmax0 = minmaxr(scores0p)
hmin1, hmax1 = minmaxr(scores1p)
bins = np.linspace(min(hmin0, hmin1), max(hmax0, hmax1), 50)
plt.hist(scores0p, bins, alpha=0.5, label='0', color=colors[0], edgecolor="none")
plt.hist(scores1p, bins, alpha=0.5, label='1', color=colors[1], edgecolor="none")
plt.legend(loc='upper right')
plt.title("%s: AUC=%.4f" % (score_name, auc_score))
plt.show()
result_grid.append(result_row)
return result_grid
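    # Illustrative (mirrors do_mapper above; 'some_score' is a placeholder):
    # compare score distributions drawn under two error rates; each returned
    # row maps a score name to the distribution-separation AUC from dist_auc:
    #
    #   h0 = Grid.with_sim_clusters(n=1000, p_err=1.0)
    #   h1 = Grid.with_sim_clusters(n=1000, p_err=0.5)
    #   rows = h0.compare(h1, ['some_score'])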
    def corrplot(self, compute_result, save_to, symmetric=False, **kwargs):
items = compute_result.items()
if not os.path.exists(save_to):
os.mkdir(save_to)
elif not os.path.isdir(save_to):
raise IOError("save_to already exists and is a file")
seen_pairs = set()
for (lbl1, arr1), (lbl2, arr2) in product(items, items):
if lbl1 == lbl2:
continue
elif (not symmetric) and (lbl2, lbl1) in seen_pairs:
continue
elif (not symmetric) and (lbl1, lbl2) in seen_pairs:
continue
figtitle = "%s vs. %s" % (lbl1, lbl2)
filename = "%s_vs_%s.png" % (lbl1, lbl2)
filepath = os.path.join(save_to, filename)
if os.path.exists(filepath):
warnings.warn("File exists: not overwriting %s" % filepath)
seen_pairs.add((lbl1, lbl2))
seen_pairs.add((lbl2, lbl1))
continue
self.plot([(arr1, arr2)], save_to=filepath, title=figtitle,
xlabel=lbl1, ylabel=lbl2, **kwargs)
seen_pairs.add((lbl1, lbl2))
seen_pairs.add((lbl2, lbl1))
@staticmethod
    def plot(pairs, xlim=None, ylim=None, title=None, dither=0.0002,
marker='.', s=0.01, color='black', alpha=1.0, save_to=None,
label=None, xlabel=None, ylabel=None, **kwargs):
from matplotlib import pyplot as plt
fig, ax = plt.subplots()
for (xs, ys), dither_, marker_, s_, color_, label_, alpha_ in \
izip_with_cycles(pairs, dither, marker, s, color, label, alpha):
rho0 = scipy.stats.spearmanr(xs, ys)[0]
rho1 = scipy.stats.spearmanr(ys, xs)[0]
if not np.isclose(rho0, rho1):
# should never happen
raise RuntimeError("Error calculating Spearman's rho")
ax.annotate('$\\rho=%.3f$' % rho0, (0.05, 0.9), xycoords='axes fraction')
if dither_ is not None:
xs = np.random.normal(xs, dither_)
ys = np.random.normal(ys, dither_)
ax.scatter(xs, ys, marker=marker_, s=s_, color=color_,
alpha=alpha_, label=label_, **kwargs)
if label:
legend = ax.legend(loc='upper left', markerscale=80, scatterpoints=1)
for lbl in legend.get_texts():
lbl.set_fontsize('small')
if xlabel is not None:
ax.set_xlabel(xlabel)
if ylabel is not None:
ax.set_ylabel(ylabel)
if xlim is not None:
ax.set_xlim(xlim)
if ylim is not None:
ax.set_ylim(ylim)
if title is not None:
ax.set_title(title)
if save_to is None:
fig.show()
else:
fig.savefig(save_to)
plt.close(fig)
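# Illustrative use of Grid.plot (data are synthetic, and the output path is
# hypothetical): scatter two correlated arrays; Spearman's rho is annotated on
# the axes and the figure is saved rather than shown.
def _example_grid_plot(save_to='example_scatter.png'):
    xs = np.random.rand(1000)
    ys = xs + np.random.normal(0.0, 0.05, 1000)
    Grid.plot([(xs, ys)], xlabel='score A', ylabel='score B', save_to=save_to)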
if __name__ == "__main__":
run(parse_args())