/*
* Copyright (c) 2004, the JUNG Project and the Regents of the University
* of California
* All rights reserved.
*
* This software is open-source under the BSD license; see either
* "license.txt" or
* http://jung.sourceforge.net/license.txt for a description.
*
* Created on Aug 12, 2004
*/
package edu.uci.ics.jung.algorithms.cluster;
import edu.uci.ics.jung.algorithms.scoring.VoltageScorer;
import edu.uci.ics.jung.algorithms.util.DiscreteDistribution;
import edu.uci.ics.jung.algorithms.util.KMeansClusterer;
import edu.uci.ics.jung.algorithms.util.KMeansClusterer.NotEnoughClustersException;
import edu.uci.ics.jung.graph.Graph;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.Set;
/**
*
Clusters vertices of a Graph
based on their ranks as
* calculated by VoltageScorer
. This algorithm is based on,
* but not identical with, the method described in the paper below.
* The primary difference is that Wu and Huberman assume a priori that the clusters
* are of approximately the same size, and therefore use a more complex
* method than k-means (which is used here) for determining cluster
* membership based on co-occurrence data.
*
* The algorithm proceeds as follows:
*
* first, generate a set of candidate clusters as follows:
*
* pick (widely separated) vertex pair, run VoltageScorer
* group the vertices in two clusters according to their voltages
* store resulting candidate clusters
*
* second, generate k-1 clusters as follows:
*
* pick a vertex v as a cluster 'seed'
*
(Wu/Huberman: most frequent vertex in candidate clusters)
* calculate co-occurrence over all candidate clusters of v with each other
* vertex
* separate co-occurrence counts into high/low;
* high vertices constitute a cluster
* remove v's vertices from candidate clusters; continue
*
* finally, remaining unassigned vertices are assigned to the kth ("garbage")
* cluster.
*
*
* NOTE: Depending on how the co-occurrence data splits the data into
* clusters, the number of clusters returned by this algorithm may be less than the
* number of clusters requested. The number of clusters will never be more than
* the number requested, however.
*
* @author Joshua O'Madadhain
* @see "'Finding communities in linear time: a physics approach', Fang Wu and Bernardo Huberman, http://www.hpl.hp.com/research/idl/papers/linear/"
* @see VoltageScorer
* @see KMeansClusterer
*/
public class VoltageClusterer
{
protected int num_candidates;
protected KMeansClusterer kmc;
protected Random rand;
protected Graph g;
/**
* Creates an instance of a VoltageCluster with the specified parameters.
* These are mostly parameters that are passed directly to VoltageScorer
* and KMeansClusterer.
*
* @param num_candidates the number of candidate clusters to create
*/
public VoltageClusterer(Graph g, int num_candidates)
{
if (num_candidates < 1)
throw new IllegalArgumentException("must generate >=1 candidates");
this.num_candidates = num_candidates;
this.kmc = new KMeansClusterer();
rand = new Random();
this.g = g;
}
protected void setRandomSeed(int random_seed)
{
rand = new Random(random_seed);
}
/**
* Returns a community (cluster) centered around v
.
* @param v the vertex whose community we wish to discover
*/
public Collection> getCommunity(V v)
{
return cluster_internal(v, 2);
}
/**
* Clusters the vertices of g
into
* num_clusters
clusters, based on their connectivity.
* @param num_clusters the number of clusters to identify
*/
public Collection> cluster(int num_clusters)
{
return cluster_internal(null, num_clusters);
}
/**
* Does the work of getCommunity
and cluster
.
* @param origin the vertex around which clustering is to be done
* @param num_clusters the (maximum) number of clusters to find
*/
protected Collection> cluster_internal(V origin, int num_clusters)
{
// generate candidate clusters
// repeat the following 'samples' times:
// * pick (widely separated) vertex pair, run VoltageScorer
// * use k-means to identify 2 communities in ranked graph
// * store resulting candidate communities
ArrayList v_array = new ArrayList(g.getVertices());
LinkedList> candidates = new LinkedList>();
for (int j = 0; j < num_candidates; j++)
{
V source;
if (origin == null)
source = v_array.get((int)(rand.nextDouble() * v_array.size()));
else
source = origin;
V target = null;
do
{
target = v_array.get((int)(rand.nextDouble() * v_array.size()));
}
while (source == target);
VoltageScorer vs = new VoltageScorer(g, source, target);
vs.evaluate();
Map voltage_ranks = new HashMap();
for (V v : g.getVertices())
voltage_ranks.put(v, new double[] {vs.getVertexScore(v)});
// addOneCandidateCluster(candidates, voltage_ranks);
addTwoCandidateClusters(candidates, voltage_ranks);
}
// repeat the following k-1 times:
// * pick a vertex v as a cluster seed
// (Wu/Huberman: most frequent vertex in candidates)
// * calculate co-occurrence (in candidate clusters)
// of this vertex with all others
// * use k-means to separate co-occurrence counts into high/low;
// high vertices are a cluster
// * remove v's vertices from candidate clusters
Collection> clusters = new LinkedList>();
Set remaining = new HashSet(g.getVertices());
List seed_candidates = getSeedCandidates(candidates);
int seed_index = 0;
for (int j = 0; j < (num_clusters - 1); j++)
{
if (remaining.isEmpty())
break;
V seed;
if (seed_index == 0 && origin != null)
seed = origin;
else
{
do { seed = seed_candidates.get(seed_index++); }
while (!remaining.contains(seed));
}
Map occur_counts = getObjectCounts(candidates, seed);
if (occur_counts.size() < 2)
break;
// now that we have the counts, cluster them...
try
{
Collection