Initial opendaylight infrastructure commit!!
[controller.git] / third-party / net.sf.jung2 / src / main / java / edu / uci / ics / jung / algorithms / cluster / VoltageClusterer.java
1 /*
2  * Copyright (c) 2004, the JUNG Project and the Regents of the University
3  * of California
4  * All rights reserved.
5  *
6  * This software is open-source under the BSD license; see either
7  * "license.txt" or
8  * http://jung.sourceforge.net/license.txt for a description.
9  *
10  * Created on Aug 12, 2004
11  */
12 package edu.uci.ics.jung.algorithms.cluster;
13
14 import edu.uci.ics.jung.algorithms.scoring.VoltageScorer;
15 import edu.uci.ics.jung.algorithms.util.DiscreteDistribution;
16 import edu.uci.ics.jung.algorithms.util.KMeansClusterer;
17 import edu.uci.ics.jung.algorithms.util.KMeansClusterer.NotEnoughClustersException;
18 import edu.uci.ics.jung.graph.Graph;
19
20 import java.util.ArrayList;
21 import java.util.Collection;
22 import java.util.Collections;
23 import java.util.Comparator;
24 import java.util.HashMap;
25 import java.util.HashSet;
26 import java.util.Iterator;
27 import java.util.LinkedList;
28 import java.util.List;
29 import java.util.Map;
30 import java.util.Random;
31 import java.util.Set;
32
33 /**
34  * <p>Clusters vertices of a <code>Graph</code> based on their ranks as
35  * calculated by <code>VoltageScorer</code>.  This algorithm is based on,
36  * but not identical with, the method described in the paper below.
37  * The primary difference is that Wu and Huberman assume a priori that the clusters
38  * are of approximately the same size, and therefore use a more complex
39  * method than k-means (which is used here) for determining cluster
40  * membership based on co-occurrence data.</p>
41  *
42  * <p>The algorithm proceeds as follows:
43  * <ul>
44  * <li/>first, generate a set of candidate clusters as follows:
45  *      <ul>
46  *      <li/>pick (widely separated) vertex pair, run VoltageScorer
47  *      <li/>group the vertices in two clusters according to their voltages
48  *      <li/>store resulting candidate clusters
49  *      </ul>
50  * <li/>second, generate k-1 clusters as follows:
51  *      <ul>
52  *      <li/>pick a vertex v as a cluster 'seed'
53  *           <br>(Wu/Huberman: most frequent vertex in candidate clusters)
54  *      <li/>calculate co-occurrence over all candidate clusters of v with each other
55  *           vertex
56  *      <li/>separate co-occurrence counts into high/low;
57  *           high vertices constitute a cluster
58  *      <li/>remove v's vertices from candidate clusters; continue
59  *      </ul>
60  * <li/>finally, remaining unassigned vertices are assigned to the kth ("garbage")
61  * cluster.
62  * </ul></p>
63  *
64  * <p><b>NOTE</b>: Depending on how the co-occurrence data splits the data into
65  * clusters, the number of clusters returned by this algorithm may be less than the
66  * number of clusters requested.  The number of clusters will never be more than
67  * the number requested, however.</p>
68  *
69  * @author Joshua O'Madadhain
70  * @see "'Finding communities in linear time: a physics approach', Fang Wu and Bernardo Huberman, http://www.hpl.hp.com/research/idl/papers/linear/"
71  * @see VoltageScorer
72  * @see KMeansClusterer
73  */
74 public class VoltageClusterer<V,E>
75 {
76     protected int num_candidates;
77     protected KMeansClusterer<V> kmc;
78     protected Random rand;
79     protected Graph<V,E> g;
80
81     /**
82      * Creates an instance of a VoltageCluster with the specified parameters.
83      * These are mostly parameters that are passed directly to VoltageScorer
84      * and KMeansClusterer.
85      *
86      * @param num_candidates    the number of candidate clusters to create
87      */
88     public VoltageClusterer(Graph<V,E> g, int num_candidates)
89     {
90         if (num_candidates < 1)
91             throw new IllegalArgumentException("must generate >=1 candidates");
92
93         this.num_candidates = num_candidates;
94         this.kmc = new KMeansClusterer<V>();
95         rand = new Random();
96         this.g = g;
97     }
98
99     protected void setRandomSeed(int random_seed)
100     {
101         rand = new Random(random_seed);
102     }
103
104     /**
105      * Returns a community (cluster) centered around <code>v</code>.
106      * @param v the vertex whose community we wish to discover
107      */
108     public Collection<Set<V>> getCommunity(V v)
109     {
110         return cluster_internal(v, 2);
111     }
112
113     /**
114      * Clusters the vertices of <code>g</code> into
115      * <code>num_clusters</code> clusters, based on their connectivity.
116      * @param num_clusters  the number of clusters to identify
117      */
118     public Collection<Set<V>> cluster(int num_clusters)
119     {
120         return cluster_internal(null, num_clusters);
121     }
122
123     /**
124      * Does the work of <code>getCommunity</code> and <code>cluster</code>.
125      * @param origin the vertex around which clustering is to be done
126      * @param num_clusters the (maximum) number of clusters to find
127      */
128     protected Collection<Set<V>> cluster_internal(V origin, int num_clusters)
129     {
130         // generate candidate clusters
131         // repeat the following 'samples' times:
132         // * pick (widely separated) vertex pair, run VoltageScorer
133         // * use k-means to identify 2 communities in ranked graph
134         // * store resulting candidate communities
135         ArrayList<V> v_array = new ArrayList<V>(g.getVertices());
136
137         LinkedList<Set<V>> candidates = new LinkedList<Set<V>>();
138
139         for (int j = 0; j < num_candidates; j++)
140         {
141             V source;
142             if (origin == null)
143                 source = v_array.get((int)(rand.nextDouble() * v_array.size()));
144             else
145                 source = origin;
146             V target = null;
147             do
148             {
149                 target = v_array.get((int)(rand.nextDouble() * v_array.size()));
150             }
151             while (source == target);
152             VoltageScorer<V,E> vs = new VoltageScorer<V,E>(g, source, target);
153             vs.evaluate();
154
155             Map<V, double[]> voltage_ranks = new HashMap<V, double[]>();
156             for (V v : g.getVertices())
157                 voltage_ranks.put(v, new double[] {vs.getVertexScore(v)});
158
159 //            addOneCandidateCluster(candidates, voltage_ranks);
160             addTwoCandidateClusters(candidates, voltage_ranks);
161         }
162
163         // repeat the following k-1 times:
164         // * pick a vertex v as a cluster seed
165         //   (Wu/Huberman: most frequent vertex in candidates)
166         // * calculate co-occurrence (in candidate clusters)
167         //   of this vertex with all others
168         // * use k-means to separate co-occurrence counts into high/low;
169         //   high vertices are a cluster
170         // * remove v's vertices from candidate clusters
171
172         Collection<Set<V>> clusters = new LinkedList<Set<V>>();
173         Set<V> remaining = new HashSet<V>(g.getVertices());
174
175         List<V> seed_candidates = getSeedCandidates(candidates);
176         int seed_index = 0;
177
178         for (int j = 0; j < (num_clusters - 1); j++)
179         {
180             if (remaining.isEmpty())
181                 break;
182
183             V seed;
184             if (seed_index == 0 && origin != null)
185                 seed = origin;
186             else
187             {
188                 do { seed = seed_candidates.get(seed_index++); }
189                 while (!remaining.contains(seed));
190             }
191
192             Map<V, double[]> occur_counts = getObjectCounts(candidates, seed);
193             if (occur_counts.size() < 2)
194                 break;
195
196             // now that we have the counts, cluster them...
197             try
198             {
199                 Collection<Map<V, double[]>> high_low = kmc.cluster(occur_counts, 2);
200                 // ...get the cluster with the highest-valued centroid...
201                 Iterator<Map<V, double[]>> h_iter = high_low.iterator();
202                 Map<V, double[]> cluster1 = h_iter.next();
203                 Map<V, double[]> cluster2 = h_iter.next();
204                 double[] centroid1 = DiscreteDistribution.mean(cluster1.values());
205                 double[] centroid2 = DiscreteDistribution.mean(cluster2.values());
206                 Set<V> new_cluster;
207                 if (centroid1[0] >= centroid2[0])
208                     new_cluster = cluster1.keySet();
209                 else
210                     new_cluster = cluster2.keySet();
211
212                 // ...remove the elements of new_cluster from each candidate...
213                 for (Set<V> cluster : candidates)
214                     cluster.removeAll(new_cluster);
215                 clusters.add(new_cluster);
216                 remaining.removeAll(new_cluster);
217             }
218             catch (NotEnoughClustersException nece)
219             {
220                 // all remaining vertices are in the same cluster
221                 break;
222             }
223         }
224
225         // identify remaining vertices (if any) as a 'garbage' cluster
226         if (!remaining.isEmpty())
227             clusters.add(remaining);
228
229         return clusters;
230     }
231
232     /**
233      * Do k-means with three intervals and pick the
234      * smaller two clusters (presumed to be on the ends); this is closer to the Wu-Huberman method.
235      * @param candidates
236      * @param voltage_ranks
237      */
238     protected void addTwoCandidateClusters(LinkedList<Set<V>> candidates,
239             Map<V, double[]> voltage_ranks)
240     {
241         try
242         {
243             List<Map<V, double[]>> clusters = new ArrayList<Map<V, double[]>>(kmc.cluster(voltage_ranks, 3));
244             boolean b01 = clusters.get(0).size() > clusters.get(1).size();
245             boolean b02 = clusters.get(0).size() > clusters.get(2).size();
246             boolean b12 = clusters.get(1).size() > clusters.get(2).size();
247             if (b01 && b02)
248             {
249                 candidates.add(clusters.get(1).keySet());
250                 candidates.add(clusters.get(2).keySet());
251             }
252             else if (!b01 && b12)
253             {
254                 candidates.add(clusters.get(0).keySet());
255                 candidates.add(clusters.get(2).keySet());
256             }
257             else if (!b02 && !b12)
258             {
259                 candidates.add(clusters.get(0).keySet());
260                 candidates.add(clusters.get(1).keySet());
261             }
262         }
263         catch (NotEnoughClustersException e)
264         {
265             // no valid candidates, continue
266         }
267     }
268
269     /**
270      * alternative to addTwoCandidateClusters(): cluster vertices by voltages into 2 clusters.
271      * We only consider the smaller of the two clusters returned
272      * by k-means to be a 'true' cluster candidate; the other is a garbage cluster.
273      * @param candidates
274      * @param voltage_ranks
275      */
276     protected void addOneCandidateCluster(LinkedList<Set<V>> candidates,
277             Map<V, double[]> voltage_ranks)
278     {
279         try
280         {
281             List<Map<V, double[]>> clusters;
282             clusters = new ArrayList<Map<V, double[]>>(kmc.cluster(voltage_ranks, 2));
283             if (clusters.get(0).size() < clusters.get(1).size())
284                 candidates.add(clusters.get(0).keySet());
285             else
286                 candidates.add(clusters.get(1).keySet());
287         }
288         catch (NotEnoughClustersException e)
289         {
290             // no valid candidates, continue
291         }
292     }
293
294     /**
295      * Returns an array of cluster seeds, ranked in decreasing order
296      * of number of appearances in the specified collection of candidate
297      * clusters.
298      * @param candidates
299      */
300     protected List<V> getSeedCandidates(Collection<Set<V>> candidates)
301     {
302         final Map<V, double[]> occur_counts = getObjectCounts(candidates, null);
303
304         ArrayList<V> occurrences = new ArrayList<V>(occur_counts.keySet());
305         Collections.sort(occurrences, new MapValueArrayComparator(occur_counts));
306
307         System.out.println("occurrences: ");
308         for (int i = 0; i < occurrences.size(); i++)
309             System.out.println(occur_counts.get(occurrences.get(i))[0]);
310
311         return occurrences;
312     }
313
314     protected Map<V, double[]> getObjectCounts(Collection<Set<V>> candidates, V seed)
315     {
316         Map<V, double[]> occur_counts = new HashMap<V, double[]>();
317         for (V v : g.getVertices())
318             occur_counts.put(v, new double[]{0});
319
320         for (Set<V> candidate : candidates)
321         {
322             if (seed == null)
323                 System.out.println(candidate.size());
324             if (seed == null || candidate.contains(seed))
325             {
326                 for (V element : candidate)
327                 {
328                     double[] count = occur_counts.get(element);
329                     count[0]++;
330                 }
331             }
332         }
333
334         if (seed == null)
335         {
336             System.out.println("occur_counts size: " + occur_counts.size());
337             for (V v : occur_counts.keySet())
338                 System.out.println(occur_counts.get(v)[0]);
339         }
340
341         return occur_counts;
342     }
343
344     protected class MapValueArrayComparator implements Comparator<V>
345     {
346         private Map<V, double[]> map;
347
348         protected MapValueArrayComparator(Map<V, double[]> map)
349         {
350             this.map = map;
351         }
352
353         public int compare(V o1, V o2)
354         {
355             double[] count0 = map.get(o1);
356             double[] count1 = map.get(o2);
357             if (count0[0] < count1[0])
358                 return 1;
359             else if (count0[0] > count1[0])
360                 return -1;
361             return 0;
362         }
363
364     }
365
366 }