/* * Copyright (c) 2003, the JUNG Project and the Regents of the University * of California * All rights reserved. * * This software is open-source under the BSD license; see either * "license.txt" or * http://jung.sourceforge.net/license.txt for a description. * * Created on Feb 18, 2004 */ package edu.uci.ics.jung.algorithms.util; import java.util.Collection; import java.util.Iterator; /** * A utility class for calculating properties of discrete distributions. * Generally, these distributions are represented as arrays of * double values, which are assumed to be normalized * such that the entries in a single array sum to 1. * * @author Joshua O'Madadhain */ public class DiscreteDistribution { /** * Returns the Kullback-Leibler divergence between the * two specified distributions, which must have the same * number of elements. This is defined as * the sum over all i of * dist[i] * Math.log(dist[i] / reference[i]). * Note that this value is not symmetric; see * symmetricKL for a symmetric variant. * @see #symmetricKL(double[], double[]) */ public static double KullbackLeibler(double[] dist, double[] reference) { double distance = 0; checkLengths(dist, reference); for (int i = 0; i < dist.length; i++) { if (dist[i] > 0 && reference[i] > 0) distance += dist[i] * Math.log(dist[i] / reference[i]); } return distance; } /** * Returns KullbackLeibler(dist, reference) + KullbackLeibler(reference, dist). * @see #KullbackLeibler(double[], double[]) */ public static double symmetricKL(double[] dist, double[] reference) { return KullbackLeibler(dist, reference) + KullbackLeibler(reference, dist); } /** * Returns the squared difference between the * two specified distributions, which must have the same * number of elements. This is defined as * the sum over all i of the square of * (dist[i] - reference[i]). */ public static double squaredError(double[] dist, double[] reference) { double error = 0; checkLengths(dist, reference); for (int i = 0; i < dist.length; i++) { double difference = dist[i] - reference[i]; error += difference * difference; } return error; } /** * Returns the cosine distance between the two * specified distributions, which must have the same number * of elements. The distributions are treated as vectors * in dist.length-dimensional space. * Given the following definitions *

v = the sum over all i of dist[i] * dist[i] *
w = the sum over all i of reference[i] * reference[i] *
vw = the sum over all i of dist[i] * reference[i] *

* the value returned is defined as vw / (Math.sqrt(v) * Math.sqrt(w)). */ public static double cosine(double[] dist, double[] reference) { double v_prod = 0; // dot product x*x double w_prod = 0; // dot product y*y double vw_prod = 0; // dot product x*y checkLengths(dist, reference); for (int i = 0; i < dist.length; i++) { vw_prod += dist[i] * reference[i]; v_prod += dist[i] * dist[i]; w_prod += reference[i] * reference[i]; } // cosine distance between v and w return vw_prod / (Math.sqrt(v_prod) * Math.sqrt(w_prod)); } /** * Returns the entropy of this distribution. * High entropy indicates that the distribution is * close to uniform; low entropy indicates that the * distribution is close to a Dirac delta (i.e., if * the probability mass is concentrated at a single * point, this method returns 0). Entropy is defined as * the sum over all i of * -(dist[i] * Math.log(dist[i])) */ public static double entropy(double[] dist) { double total = 0; for (int i = 0; i < dist.length; i++) { if (dist[i] > 0) total += dist[i] * Math.log(dist[i]); } return -total; } /** * Throws an IllegalArgumentException if the two arrays are not of the same length. */ protected static void checkLengths(double[] dist, double[] reference) { if (dist.length != reference.length) throw new IllegalArgumentException("Arrays must be of the same length"); } /** * Normalizes, with Lagrangian smoothing, the specified double * array, so that the values sum to 1 (i.e., can be treated as probabilities). * The effect of the Lagrangian smoothing is to ensure that all entries * are nonzero; effectively, a value of alpha is added to each * entry in the original array prior to normalization. * @param counts * @param alpha */ public static void normalize(double[] counts, double alpha) { double total_count = 0; for (int i = 0; i < counts.length; i++) total_count += counts[i]; for (int i = 0; i < counts.length; i++) counts[i] = (counts[i] + alpha) / (total_count + counts.length * alpha); } /** * Returns the mean of the specified Collection of * distributions, which are assumed to be normalized arrays of * double values. * @see #mean(double[][]) */ public static double[] mean(Collection distributions) { if (distributions.isEmpty()) throw new IllegalArgumentException("Distribution collection must be non-empty"); Iterator iter = distributions.iterator(); double[] first = iter.next(); double[][] d_array = new double[distributions.size()][first.length]; d_array[0] = first; for (int i = 1; i < d_array.length; i++) d_array[i] = iter.next(); return mean(d_array); } /** * Returns the mean of the specified array of distributions, * represented as normalized arrays of double values. * Will throw an "index out of bounds" exception if the * distribution arrays are not all of the same length. */ public static double[] mean(double[][] distributions) { double[] d_mean = new double[distributions[0].length]; for (int j = 0; j < d_mean.length; j++) d_mean[j] = 0; for (int i = 0; i < distributions.length; i++) for (int j = 0; j < d_mean.length; j++) d_mean[j] += distributions[i][j] / distributions.length; return d_mean; } }