/*
 * Copyright (c) 2004, the JUNG Project and the Regents of the University
 * of California
 * All rights reserved.
 *
 * This software is open-source under the BSD license; see either
 * "license.txt" or
 * http://jung.sourceforge.net/license.txt for a description.
 *
 * Created on Aug 12, 2004
 */
package edu.uci.ics.jung.algorithms.cluster;

import edu.uci.ics.jung.algorithms.scoring.VoltageScorer;
import edu.uci.ics.jung.algorithms.util.DiscreteDistribution;
import edu.uci.ics.jung.algorithms.util.KMeansClusterer;
import edu.uci.ics.jung.algorithms.util.KMeansClusterer.NotEnoughClustersException;
import edu.uci.ics.jung.graph.Graph;

import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.Set;

/**
 * <p>Clusters vertices of a <code>Graph</code> based on their ranks as
 * calculated by <code>VoltageScorer</code>.  This algorithm is based on,
 * but not identical to, the method described in the paper below.  The
 * primary difference is that Wu and Huberman assume a priori that the
 * clusters are of approximately the same size, and therefore use a more
 * complex method than k-means (which is used here) for determining cluster
 * membership based on co-occurrence data.</p>
 *
 * <p>The algorithm proceeds as follows:
 * <ul>
 * <li>first, generate a set of candidate clusters:
 *   <ul>
 *   <li>pick a (widely separated) vertex pair and run
 *       <code>VoltageScorer</code>
 *   <li>group the vertices into two clusters according to their voltages
 *   <li>store the resulting candidate clusters
 *   </ul>
 * <li>second, generate <code>k - 1</code> clusters:
 *   <ul>
 *   <li>pick a vertex <code>v</code> as a cluster seed
 *       (Wu/Huberman: the most frequent vertex in the candidates)
 *   <li>calculate the co-occurrence, over all candidate clusters, of
 *       <code>v</code> with each other vertex
 *   <li>use k-means to separate the co-occurrence counts into high and low;
 *       the high vertices constitute a cluster
 *   <li>remove the new cluster's vertices from the candidate clusters
 *       and continue
 *   </ul>
 * <li>finally, assign any remaining vertices to a k-th ('garbage') cluster
 * </ul></p>
 *
 * <p>NOTE: Depending on how the co-occurrence data splits the data into
 * clusters, the number of clusters returned by this algorithm may be fewer
 * than the number of clusters requested.  The number of clusters will never
 * be more than the number requested, however.</p>
 *
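 * <p>A minimal usage sketch.  The <code>SparseMultigraph</code> implementation
 * (from <code>edu.uci.ics.jung.graph</code>), the vertex/edge values, and the
 * candidate count are illustrative assumptions, not requirements of this
 * class:
 * <pre>{@code
 * Graph<String, Integer> g = new SparseMultigraph<String, Integer>();
 * // two triangles joined by a single edge; addEdge creates missing vertices
 * g.addEdge(0, "a", "b"); g.addEdge(1, "b", "c"); g.addEdge(2, "c", "a");
 * g.addEdge(3, "d", "e"); g.addEdge(4, "e", "f"); g.addEdge(5, "f", "d");
 * g.addEdge(6, "c", "d");
 * VoltageClusterer<String, Integer> vc =
 *     new VoltageClusterer<String, Integer>(g, 100); // 100 candidate clusters
 * Collection<Set<String>> communities = vc.cluster(2);
 * // likely (but, since the algorithm is randomized, not guaranteed) result:
 * // {a,b,c} and {d,e,f}
 * }</pre></p>
 *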
 * @author Joshua O'Madadhain
 * @see "'Finding communities in linear time: a physics approach', Fang Wu and Bernardo Huberman, http://www.hpl.hp.com/research/idl/papers/linear/"
 * @see VoltageScorer
 * @see KMeansClusterer
 */
public class VoltageClusterer<V, E>
{
    protected int num_candidates;
    protected KMeansClusterer<V> kmc;
    protected Random rand;
    protected Graph<V, E> g;

    /**
     * Creates an instance of a VoltageClusterer with the specified parameters.
     * These are mostly parameters that are passed directly to VoltageScorer
     * and KMeansClusterer.
     *
     * @param g              the graph whose vertices are to be clustered
     * @param num_candidates the number of candidate clusters to create
     */
    public VoltageClusterer(Graph<V, E> g, int num_candidates)
    {
        if (num_candidates < 1)
            throw new IllegalArgumentException("must generate >= 1 candidates");

        this.num_candidates = num_candidates;
        this.kmc = new KMeansClusterer<V>();
        this.rand = new Random();
        this.g = g;
    }

    protected void setRandomSeed(int random_seed)
    {
        rand = new Random(random_seed);
    }

    /**
     * Returns a community (cluster) centered around <code>v</code>.
     * @param v the vertex whose community we wish to discover
     */
    public Collection<Set<V>> getCommunity(V v)
    {
        return cluster_internal(v, 2);
    }

    /**
     * Clusters the vertices of <code>g</code> into
     * <code>num_clusters</code> clusters, based on their connectivity.
     * @param num_clusters the number of clusters to identify
     */
    public Collection<Set<V>> cluster(int num_clusters)
    {
        return cluster_internal(null, num_clusters);
    }

    /**
     * Does the work of <code>getCommunity</code> and <code>cluster</code>.
     * @param origin       the vertex around which clustering is to be done
     * @param num_clusters the (maximum) number of clusters to find
     */
    protected Collection<Set<V>> cluster_internal(V origin, int num_clusters)
    {
        // generate candidate clusters
        // repeat the following 'num_candidates' times:
        // * pick a (widely separated) vertex pair, run VoltageScorer
        // * use k-means to identify 2 communities in the ranked graph
        // * store the resulting candidate communities
        ArrayList<V> v_array = new ArrayList<V>(g.getVertices());

        LinkedList<Set<V>> candidates = new LinkedList<Set<V>>();

        for (int j = 0; j < num_candidates; j++)
        {
            V source;
            if (origin == null)
                source = v_array.get((int) (rand.nextDouble() * v_array.size()));
            else
                source = origin;
            V target = null;
            do
            {
                target = v_array.get((int) (rand.nextDouble() * v_array.size()));
            }
            while (source == target);

            VoltageScorer<V, E> vs = new VoltageScorer<V, E>(g, source, target);
            vs.evaluate();

            Map<V, double[]> voltage_ranks = new HashMap<V, double[]>();
            for (V v : g.getVertices())
                voltage_ranks.put(v, new double[] { vs.getVertexScore(v) });

//            addOneCandidateCluster(candidates, voltage_ranks);
            addTwoCandidateClusters(candidates, voltage_ranks);
        }

        // repeat the following k-1 times:
        // * pick a vertex v as a cluster seed
        //   (Wu/Huberman: most frequent vertex in candidates)
        // * calculate co-occurrence (in candidate clusters)
        //   of this vertex with all others
        // * use k-means to separate co-occurrence counts into high/low;
        //   high vertices are a cluster
        // * remove v's vertices from candidate clusters

        Collection<Set<V>> clusters = new LinkedList<Set<V>>();
        Set<V> remaining = new HashSet<V>(g.getVertices());

        List<V> seed_candidates = getSeedCandidates(candidates);
        int seed_index = 0;

        for (int j = 0; j < (num_clusters - 1); j++)
        {
            if (remaining.isEmpty())
                break;

            V seed;
            if (seed_index == 0 && origin != null)
                seed = origin;
            else
            {
                do { seed = seed_candidates.get(seed_index++); }
                while (!remaining.contains(seed));
            }

            Map<V, double[]> occur_counts = getObjectCounts(candidates, seed);
            if (occur_counts.size() < 2)
                break;

            // now that we have the counts, cluster them...
            try
            {
                Collection<Map<V, double[]>> high_low = kmc.cluster(occur_counts, 2);
                // ...get the cluster with the highest-valued centroid...
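                // (k-means was asked for exactly two clusters, so high_low
                // contains two vertex -> count maps; the one whose centroid
                // (mean count) is larger holds the vertices that most often
                // co-occurred with the seed, and becomes the new community)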
                Iterator<Map<V, double[]>> h_iter = high_low.iterator();
                Map<V, double[]> cluster1 = h_iter.next();
                Map<V, double[]> cluster2 = h_iter.next();
                double[] centroid1 = DiscreteDistribution.mean(cluster1.values());
                double[] centroid2 = DiscreteDistribution.mean(cluster2.values());
                Set<V> new_cluster;
                if (centroid1[0] >= centroid2[0])
                    new_cluster = cluster1.keySet();
                else
                    new_cluster = cluster2.keySet();

                // ...remove the elements of new_cluster from each candidate...
                for (Set<V> cluster : candidates)
                    cluster.removeAll(new_cluster);
                clusters.add(new_cluster);
                remaining.removeAll(new_cluster);
            }
            catch (NotEnoughClustersException nece)
            {
                // all remaining vertices are in the same cluster
                break;
            }
        }

        // identify remaining vertices (if any) as a 'garbage' cluster
        if (!remaining.isEmpty())
            clusters.add(remaining);

        return clusters;
    }

    /**
     * Does k-means with three intervals and picks the two smaller clusters
     * (presumed to be on the ends); this is closer to the Wu-Huberman method.
     * @param candidates    the list to which the new candidate clusters are added
     * @param voltage_ranks the voltage rank for each vertex
     */
    protected void addTwoCandidateClusters(LinkedList<Set<V>> candidates,
            Map<V, double[]> voltage_ranks)
    {
        try
        {
            List<Map<V, double[]>> clusters =
                new ArrayList<Map<V, double[]>>(kmc.cluster(voltage_ranks, 3));
            // compare the three cluster sizes pairwise, then add the two
            // smallest clusters as candidates
            boolean b01 = clusters.get(0).size() > clusters.get(1).size();
            boolean b02 = clusters.get(0).size() > clusters.get(2).size();
            boolean b12 = clusters.get(1).size() > clusters.get(2).size();

            if (b01 && b02)
            {
                // cluster 0 is the largest
                candidates.add(clusters.get(1).keySet());
                candidates.add(clusters.get(2).keySet());
            }
            else if (!b01 && b12)
            {
                // cluster 1 is the largest
                candidates.add(clusters.get(0).keySet());
                candidates.add(clusters.get(2).keySet());
            }
            else if (!b02 && !b12)
            {
                // cluster 2 is the largest
                candidates.add(clusters.get(0).keySet());
                candidates.add(clusters.get(1).keySet());
            }
        }
        catch (NotEnoughClustersException e)
        {
            // no valid candidates, continue
        }
    }

    /**
     * Alternative to <code>addTwoCandidateClusters()</code>: clusters vertices
     * by voltages into 2 clusters.  Only the smaller of the two clusters
     * returned by k-means is considered a 'true' cluster candidate; the other
     * is a garbage cluster.
     * @param candidates    the list to which the new candidate cluster is added
     * @param voltage_ranks the voltage rank for each vertex
     */
    protected void addOneCandidateCluster(LinkedList<Set<V>> candidates,
            Map<V, double[]> voltage_ranks)
    {
        try
        {
            List<Map<V, double[]>> clusters =
                new ArrayList<Map<V, double[]>>(kmc.cluster(voltage_ranks, 2));
            if (clusters.get(0).size() < clusters.get(1).size())
                candidates.add(clusters.get(0).keySet());
            else
                candidates.add(clusters.get(1).keySet());
        }
        catch (NotEnoughClustersException e)
        {
            // no valid candidates, continue
        }
    }

    /**
     * Returns a list of cluster seeds, ranked in decreasing order
     * of number of appearances in the specified collection of candidate
     * clusters.
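     * <p>For example (illustrative, not from the paper): given candidate
     * clusters {a, b}, {a, c}, and {a, b}, vertex a appears 3 times, b twice,
     * and c once, so the returned list begins a, b, c; ties are broken
     * arbitrarily.</p>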
     * @param candidates the candidate clusters to count appearances in
     */
    protected List<V> getSeedCandidates(Collection<Set<V>> candidates)
    {
        final Map<V, double[]> occur_counts = getObjectCounts(candidates, null);

        ArrayList<V> occurrences = new ArrayList<V>(occur_counts.keySet());
        Collections.sort(occurrences, new MapValueArrayComparator(occur_counts));

        // debug output: the sorted occurrence counts
        System.out.println("occurrences: ");
        for (int i = 0; i < occurrences.size(); i++)
            System.out.println(occur_counts.get(occurrences.get(i))[0]);

        return occurrences;
    }

    /**
     * Returns a map from each vertex to the number of candidate clusters that
     * contain both that vertex and <code>seed</code>, or (if
     * <code>seed</code> is null) to the vertex's total number of appearances.
     * Counts are stored as single-element arrays so that they can be fed
     * directly to <code>KMeansClusterer</code>.
     */
    protected Map<V, double[]> getObjectCounts(Collection<Set<V>> candidates, V seed)
    {
        Map<V, double[]> occur_counts = new HashMap<V, double[]>();
        for (V v : g.getVertices())
            occur_counts.put(v, new double[] { 0 });

        for (Set<V> candidate : candidates)
        {
            if (seed == null)
                System.out.println(candidate.size()); // debug output
            if (seed == null || candidate.contains(seed))
            {
                for (V element : candidate)
                {
                    double[] count = occur_counts.get(element);
                    count[0]++;
                }
            }
        }

        if (seed == null)
        {
            // debug output: the count for each vertex
            System.out.println("occur_counts size: " + occur_counts.size());
            for (V v : occur_counts.keySet())
                System.out.println(occur_counts.get(v)[0]);
        }

        return occur_counts;
    }

    /**
     * Orders vertices in decreasing order of the (single-element) count
     * arrays associated with them in the supplied map.
     */
    protected class MapValueArrayComparator implements Comparator<V>
    {
        private Map<V, double[]> map;

        protected MapValueArrayComparator(Map<V, double[]> map)
        {
            this.map = map;
        }

        public int compare(V o1, V o2)
        {
            double[] count0 = map.get(o1);
            double[] count1 = map.get(o2);
            if (count0[0] < count1[0])
                return 1;
            else if (count0[0] > count1[0])
                return -1;
            return 0;
        }
    }
}