Source code for clustering_metrics.utils

# -*- coding: utf-8 -*-
import random
import operator
import string
from math import log
from itertools import imap
from operator import itemgetter
from pymaptools.iter import isiterable


PINF = float('inf')
NINF = float('-inf')
NAN = float('nan')


def _log(x, base=None):
    """Safe natural log
    """
    if x == 0.0:
        return NINF
    elif base is None:
        return log(x)
    else:
        return log(x, base)


def _div(numer, denom):
    """Safe division
    """
    if denom == 0.0:
        if numer == 0.0:
            return NAN
        elif numer > 0.0:
            return PINF
        else:
            return NINF
    return numer / float(denom)


[docs]def get_df_subset(df, fields): """Give a subset of a ``pandas.DataFrame`` instance """ subset_fields = [field for field in set(fields) if field in df] return df[subset_fields]
[docs]def fill_with_last(lst, k): """ extend a list to length k by duplicating last item >>> fill_with_last([1, 2, 3], 5) [1, 2, 3, 3, 3] """ len_l = len(lst) if len_l < k: lst.extend([lst[-1]] * (k - len_l)) return lst
[docs]def wrap_scalar(a): """If scalar, convert to tuple""" return a if isiterable(a) else (a,)
[docs]def tsorted(a): """Sort a tuple""" return tuple(sorted(a))
[docs]def getpropval(obj): """ :return: a generator of properties and their values """ return ((p, val) for p, val in ((p, getattr(obj, p)) for p in dir(obj)) if not callable(val) and p[0] != '_')
[docs]def gapply(n, func, *args, **kwargs): """Apply a generating function n times to the argument list :param n: number of times to apply a function :type n: integer :param func: a function to apply :type func: instancemethod :rtype: collections.iterable """ for _ in xrange(n): yield func(*args, **kwargs)
[docs]def lapply(n, func, *args, **kwargs): """Same as gapply, except returns a list :param n: number of times to apply a function :type n: integer :param func: a function to apply :type func: instancemethod :rtype: list """ return list(gapply(n, func, *args, **kwargs))
[docs]def randset(value_range=(0, 10), sample_range=(5, 20)): """Return a random set of integers sampled :returns: a list of integers :rtype: tuple """ n = random.choice(range(*sample_range)) source = range(*value_range) return tuple(sorted(set(gapply(n, random.choice, source))))
[docs]def random_string(length, alphabet=string.letters): """Generate a random string :param length: length of the string :type length: int :param alphabet: alphabet to draw letters from :type alphabet: str :return: random string of specified length :rtype: str """ return ''.join(str(random.choice(alphabet)) for _ in xrange(length))
[docs]def sigsim(x, y, dim): """Return the similarity of the two signatures :param x: signature 1 :type x: object :param y: signature 2 :type y: object :param dim: number of dimensions :type dim: int :returns: similarity between two signatures :rtype: float """ return sum(imap(operator.eq, x, y)) / float(dim)
[docs]def sort_by_length(els, reverse=True): """Given a list of els, sort its elements by len() in descending order. Returns a generator :param els: input list :type els: list :param reverse: Whether to reverse a list :type reverse: bool :rtype: collections.iterable """ return imap(itemgetter(0), sorted(((s, len(s)) for s in els), key=operator.itemgetter(1), reverse=reverse))