/*
 * Copyright (c) 2004, the JUNG Project and the Regents of the University
 * of California.  All rights reserved.
 *
 * This software is open-source under the BSD license; see either
 * "license.txt" or
 * http://jung.sourceforge.net/license.txt for a description.
 * Created on Aug 12, 2004
 */
12 package edu.uci.ics.jung.algorithms.cluster;
import edu.uci.ics.jung.algorithms.scoring.VoltageScorer;
import edu.uci.ics.jung.algorithms.util.DiscreteDistribution;
import edu.uci.ics.jung.algorithms.util.KMeansClusterer;
import edu.uci.ics.jung.algorithms.util.KMeansClusterer.NotEnoughClustersException;
import edu.uci.ics.jung.graph.Graph;

import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.Set;
34 * <p>Clusters vertices of a <code>Graph</code> based on their ranks as
35 * calculated by <code>VoltageScorer</code>. This algorithm is based on,
36 * but not identical with, the method described in the paper below.
37 * The primary difference is that Wu and Huberman assume a priori that the clusters
38 * are of approximately the same size, and therefore use a more complex
39 * method than k-means (which is used here) for determining cluster
40 * membership based on co-occurrence data.</p>
42 * <p>The algorithm proceeds as follows:
44 * <li/>first, generate a set of candidate clusters as follows:
46 * <li/>pick (widely separated) vertex pair, run VoltageScorer
47 * <li/>group the vertices in two clusters according to their voltages
48 * <li/>store resulting candidate clusters
50 * <li/>second, generate k-1 clusters as follows:
52 * <li/>pick a vertex v as a cluster 'seed'
53 * <br>(Wu/Huberman: most frequent vertex in candidate clusters)
54 * <li/>calculate co-occurrence over all candidate clusters of v with each other
56 * <li/>separate co-occurrence counts into high/low;
57 * high vertices constitute a cluster
58 * <li/>remove v's vertices from candidate clusters; continue
60 * <li/>finally, remaining unassigned vertices are assigned to the kth ("garbage")
64 * <p><b>NOTE</b>: Depending on how the co-occurrence data splits the data into
65 * clusters, the number of clusters returned by this algorithm may be less than the
66 * number of clusters requested. The number of clusters will never be more than
67 * the number requested, however.</p>
69 * @author Joshua O'Madadhain
70 * @see "'Finding communities in linear time: a physics approach', Fang Wu and Bernardo Huberman, http://www.hpl.hp.com/research/idl/papers/linear/"
72 * @see KMeansClusterer
74 public class VoltageClusterer<V,E>
    protected int num_candidates;     // number of candidate clusters to generate per run
    protected KMeansClusterer<V> kmc; // k-means engine used to split score maps into clusters
    protected Random rand;            // RNG used to pick random source/target vertex pairs
    protected Graph<V,E> g;           // the graph whose vertices are being clustered
82 * Creates an instance of a VoltageCluster with the specified parameters.
83 * These are mostly parameters that are passed directly to VoltageScorer
84 * and KMeansClusterer.
86 * @param num_candidates the number of candidate clusters to create
88 public VoltageClusterer(Graph<V,E> g, int num_candidates)
90 if (num_candidates < 1)
91 throw new IllegalArgumentException("must generate >=1 candidates");
93 this.num_candidates = num_candidates;
94 this.kmc = new KMeansClusterer<V>();
99 protected void setRandomSeed(int random_seed)
101 rand = new Random(random_seed);
105 * Returns a community (cluster) centered around <code>v</code>.
106 * @param v the vertex whose community we wish to discover
108 public Collection<Set<V>> getCommunity(V v)
110 return cluster_internal(v, 2);
114 * Clusters the vertices of <code>g</code> into
115 * <code>num_clusters</code> clusters, based on their connectivity.
116 * @param num_clusters the number of clusters to identify
118 public Collection<Set<V>> cluster(int num_clusters)
120 return cluster_internal(null, num_clusters);
124 * Does the work of <code>getCommunity</code> and <code>cluster</code>.
125 * @param origin the vertex around which clustering is to be done
126 * @param num_clusters the (maximum) number of clusters to find
128 protected Collection<Set<V>> cluster_internal(V origin, int num_clusters)
130 // generate candidate clusters
131 // repeat the following 'samples' times:
132 // * pick (widely separated) vertex pair, run VoltageScorer
133 // * use k-means to identify 2 communities in ranked graph
134 // * store resulting candidate communities
135 ArrayList<V> v_array = new ArrayList<V>(g.getVertices());
137 LinkedList<Set<V>> candidates = new LinkedList<Set<V>>();
139 for (int j = 0; j < num_candidates; j++)
143 source = v_array.get((int)(rand.nextDouble() * v_array.size()));
149 target = v_array.get((int)(rand.nextDouble() * v_array.size()));
151 while (source == target);
152 VoltageScorer<V,E> vs = new VoltageScorer<V,E>(g, source, target);
155 Map<V, double[]> voltage_ranks = new HashMap<V, double[]>();
156 for (V v : g.getVertices())
157 voltage_ranks.put(v, new double[] {vs.getVertexScore(v)});
159 // addOneCandidateCluster(candidates, voltage_ranks);
160 addTwoCandidateClusters(candidates, voltage_ranks);
163 // repeat the following k-1 times:
164 // * pick a vertex v as a cluster seed
165 // (Wu/Huberman: most frequent vertex in candidates)
166 // * calculate co-occurrence (in candidate clusters)
167 // of this vertex with all others
168 // * use k-means to separate co-occurrence counts into high/low;
169 // high vertices are a cluster
170 // * remove v's vertices from candidate clusters
172 Collection<Set<V>> clusters = new LinkedList<Set<V>>();
173 Set<V> remaining = new HashSet<V>(g.getVertices());
175 List<V> seed_candidates = getSeedCandidates(candidates);
178 for (int j = 0; j < (num_clusters - 1); j++)
180 if (remaining.isEmpty())
184 if (seed_index == 0 && origin != null)
188 do { seed = seed_candidates.get(seed_index++); }
189 while (!remaining.contains(seed));
192 Map<V, double[]> occur_counts = getObjectCounts(candidates, seed);
193 if (occur_counts.size() < 2)
196 // now that we have the counts, cluster them...
199 Collection<Map<V, double[]>> high_low = kmc.cluster(occur_counts, 2);
200 // ...get the cluster with the highest-valued centroid...
201 Iterator<Map<V, double[]>> h_iter = high_low.iterator();
202 Map<V, double[]> cluster1 = h_iter.next();
203 Map<V, double[]> cluster2 = h_iter.next();
204 double[] centroid1 = DiscreteDistribution.mean(cluster1.values());
205 double[] centroid2 = DiscreteDistribution.mean(cluster2.values());
207 if (centroid1[0] >= centroid2[0])
208 new_cluster = cluster1.keySet();
210 new_cluster = cluster2.keySet();
212 // ...remove the elements of new_cluster from each candidate...
213 for (Set<V> cluster : candidates)
214 cluster.removeAll(new_cluster);
215 clusters.add(new_cluster);
216 remaining.removeAll(new_cluster);
218 catch (NotEnoughClustersException nece)
220 // all remaining vertices are in the same cluster
225 // identify remaining vertices (if any) as a 'garbage' cluster
226 if (!remaining.isEmpty())
227 clusters.add(remaining);
233 * Do k-means with three intervals and pick the
234 * smaller two clusters (presumed to be on the ends); this is closer to the Wu-Huberman method.
236 * @param voltage_ranks
238 protected void addTwoCandidateClusters(LinkedList<Set<V>> candidates,
239 Map<V, double[]> voltage_ranks)
243 List<Map<V, double[]>> clusters = new ArrayList<Map<V, double[]>>(kmc.cluster(voltage_ranks, 3));
244 boolean b01 = clusters.get(0).size() > clusters.get(1).size();
245 boolean b02 = clusters.get(0).size() > clusters.get(2).size();
246 boolean b12 = clusters.get(1).size() > clusters.get(2).size();
249 candidates.add(clusters.get(1).keySet());
250 candidates.add(clusters.get(2).keySet());
252 else if (!b01 && b12)
254 candidates.add(clusters.get(0).keySet());
255 candidates.add(clusters.get(2).keySet());
257 else if (!b02 && !b12)
259 candidates.add(clusters.get(0).keySet());
260 candidates.add(clusters.get(1).keySet());
263 catch (NotEnoughClustersException e)
265 // no valid candidates, continue
270 * alternative to addTwoCandidateClusters(): cluster vertices by voltages into 2 clusters.
271 * We only consider the smaller of the two clusters returned
272 * by k-means to be a 'true' cluster candidate; the other is a garbage cluster.
274 * @param voltage_ranks
276 protected void addOneCandidateCluster(LinkedList<Set<V>> candidates,
277 Map<V, double[]> voltage_ranks)
281 List<Map<V, double[]>> clusters;
282 clusters = new ArrayList<Map<V, double[]>>(kmc.cluster(voltage_ranks, 2));
283 if (clusters.get(0).size() < clusters.get(1).size())
284 candidates.add(clusters.get(0).keySet());
286 candidates.add(clusters.get(1).keySet());
288 catch (NotEnoughClustersException e)
290 // no valid candidates, continue
295 * Returns an array of cluster seeds, ranked in decreasing order
296 * of number of appearances in the specified collection of candidate
300 protected List<V> getSeedCandidates(Collection<Set<V>> candidates)
302 final Map<V, double[]> occur_counts = getObjectCounts(candidates, null);
304 ArrayList<V> occurrences = new ArrayList<V>(occur_counts.keySet());
305 Collections.sort(occurrences, new MapValueArrayComparator(occur_counts));
307 System.out.println("occurrences: ");
308 for (int i = 0; i < occurrences.size(); i++)
309 System.out.println(occur_counts.get(occurrences.get(i))[0]);
314 protected Map<V, double[]> getObjectCounts(Collection<Set<V>> candidates, V seed)
316 Map<V, double[]> occur_counts = new HashMap<V, double[]>();
317 for (V v : g.getVertices())
318 occur_counts.put(v, new double[]{0});
320 for (Set<V> candidate : candidates)
323 System.out.println(candidate.size());
324 if (seed == null || candidate.contains(seed))
326 for (V element : candidate)
328 double[] count = occur_counts.get(element);
336 System.out.println("occur_counts size: " + occur_counts.size());
337 for (V v : occur_counts.keySet())
338 System.out.println(occur_counts.get(v)[0]);
344 protected class MapValueArrayComparator implements Comparator<V>
346 private Map<V, double[]> map;
348 protected MapValueArrayComparator(Map<V, double[]> map)
353 public int compare(V o1, V o2)
355 double[] count0 = map.get(o1);
356 double[] count1 = map.get(o2);
357 if (count0[0] < count1[0])
359 else if (count0[0] > count1[0])