/*
 * Copyright (c) 2004, the JUNG Project and the Regents of the University
 * of California.  All rights reserved.
 *
 * This software is open-source under the BSD license; see either
 * "license.txt" or
 * http://jung.sourceforge.net/license.txt for a description.
 * Created on Aug 12, 2004
 */
12 package edu.uci.ics.jung.algorithms.cluster;
import edu.uci.ics.jung.algorithms.scoring.VoltageScorer;
import edu.uci.ics.jung.algorithms.util.DiscreteDistribution;
import edu.uci.ics.jung.algorithms.util.KMeansClusterer;
import edu.uci.ics.jung.algorithms.util.KMeansClusterer.NotEnoughClustersException;
import edu.uci.ics.jung.graph.Graph;

import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.Set;
34 * <p>Clusters vertices of a <code>Graph</code> based on their ranks as
35 * calculated by <code>VoltageScorer</code>. This algorithm is based on,
36 * but not identical with, the method described in the paper below.
37 * The primary difference is that Wu and Huberman assume a priori that the clusters
38 * are of approximately the same size, and therefore use a more complex
39 * method than k-means (which is used here) for determining cluster
40 * membership based on co-occurrence data.</p>
42 * <p>The algorithm proceeds as follows:
44 * <li/>first, generate a set of candidate clusters as follows:
46 * <li/>pick (widely separated) vertex pair, run VoltageScorer
47 * <li/>group the vertices in two clusters according to their voltages
48 * <li/>store resulting candidate clusters
50 * <li/>second, generate k-1 clusters as follows:
52 * <li/>pick a vertex v as a cluster 'seed'
53 * <br>(Wu/Huberman: most frequent vertex in candidate clusters)
54 * <li/>calculate co-occurrence over all candidate clusters of v with each other
56 * <li/>separate co-occurrence counts into high/low;
57 * high vertices constitute a cluster
58 * <li/>remove v's vertices from candidate clusters; continue
60 * <li/>finally, remaining unassigned vertices are assigned to the kth ("garbage")
64 * <p><b>NOTE</b>: Depending on how the co-occurrence data splits the data into
65 * clusters, the number of clusters returned by this algorithm may be less than the
66 * number of clusters requested. The number of clusters will never be more than
67 * the number requested, however.</p>
69 * @author Joshua O'Madadhain
70 * @see "'Finding communities in linear time: a physics approach', Fang Wu and Bernardo Huberman, http://www.hpl.hp.com/research/idl/papers/linear/"
72 * @see KMeansClusterer
74 public class VoltageClusterer<V,E>
    protected int num_candidates;     // number of candidate clusters to generate per run
    protected KMeansClusterer<V> kmc; // k-means engine used to split score maps into clusters
    protected Random rand;            // RNG used to pick random source/target vertex pairs
    protected Graph<V,E> g;           // the graph whose vertices are being clustered
82 * Creates an instance of a VoltageCluster with the specified parameters.
83 * These are mostly parameters that are passed directly to VoltageScorer
84 * and KMeansClusterer.
86 * @param num_candidates the number of candidate clusters to create
88 public VoltageClusterer(Graph<V,E> g, int num_candidates)
90 if (num_candidates < 1)
91 throw new IllegalArgumentException("must generate >=1 candidates");
93 this.num_candidates = num_candidates;
94 this.kmc = new KMeansClusterer<V>();
99 protected void setRandomSeed(int random_seed)
101 rand = new Random(random_seed);
105 * Returns a community (cluster) centered around <code>v</code>.
106 * @param v the vertex whose community we wish to discover
108 public Collection<Set<V>> getCommunity(V v)
110 return cluster_internal(v, 2);
114 * Clusters the vertices of <code>g</code> into
115 * <code>num_clusters</code> clusters, based on their connectivity.
116 * @param num_clusters the number of clusters to identify
118 public Collection<Set<V>> cluster(int num_clusters)
120 return cluster_internal(null, num_clusters);
124 * Does the work of <code>getCommunity</code> and <code>cluster</code>.
125 * @param origin the vertex around which clustering is to be done
126 * @param num_clusters the (maximum) number of clusters to find
128 protected Collection<Set<V>> cluster_internal(V origin, int num_clusters)
130 // generate candidate clusters
131 // repeat the following 'samples' times:
132 // * pick (widely separated) vertex pair, run VoltageScorer
133 // * use k-means to identify 2 communities in ranked graph
134 // * store resulting candidate communities
135 ArrayList<V> v_array = new ArrayList<V>(g.getVertices());
137 LinkedList<Set<V>> candidates = new LinkedList<Set<V>>();
139 for (int j = 0; j < num_candidates; j++)
143 source = v_array.get((int)(rand.nextDouble() * v_array.size()));
149 target = v_array.get((int)(rand.nextDouble() * v_array.size()));
151 while (source == target);
152 VoltageScorer<V,E> vs = new VoltageScorer<V,E>(g, source, target);
155 Map<V, double[]> voltage_ranks = new HashMap<V, double[]>();
156 for (V v : g.getVertices())
157 voltage_ranks.put(v, new double[] {vs.getVertexScore(v)});
159 // addOneCandidateCluster(candidates, voltage_ranks);
160 addTwoCandidateClusters(candidates, voltage_ranks);
163 // repeat the following k-1 times:
164 // * pick a vertex v as a cluster seed
165 // (Wu/Huberman: most frequent vertex in candidates)
166 // * calculate co-occurrence (in candidate clusters)
167 // of this vertex with all others
168 // * use k-means to separate co-occurrence counts into high/low;
169 // high vertices are a cluster
170 // * remove v's vertices from candidate clusters
172 Collection<Set<V>> clusters = new LinkedList<Set<V>>();
173 Set<V> remaining = new HashSet<V>(g.getVertices());
175 List<V> seed_candidates = getSeedCandidates(candidates);
178 for (int j = 0; j < (num_clusters - 1); j++)
180 if (remaining.isEmpty())
184 if (seed_index == 0 && origin != null)
188 do { seed = seed_candidates.get(seed_index++); }
189 while (!remaining.contains(seed));
192 Map<V, double[]> occur_counts = getObjectCounts(candidates, seed);
193 if (occur_counts.size() < 2)
196 // now that we have the counts, cluster them...
199 Collection<Map<V, double[]>> high_low = kmc.cluster(occur_counts, 2);
200 // ...get the cluster with the highest-valued centroid...
201 Iterator<Map<V, double[]>> h_iter = high_low.iterator();
202 Map<V, double[]> cluster1 = h_iter.next();
203 Map<V, double[]> cluster2 = h_iter.next();
204 double[] centroid1 = DiscreteDistribution.mean(cluster1.values());
205 double[] centroid2 = DiscreteDistribution.mean(cluster2.values());
207 if (centroid1[0] >= centroid2[0])
208 new_cluster = cluster1.keySet();
210 new_cluster = cluster2.keySet();
212 // ...remove the elements of new_cluster from each candidate...
213 for (Set<V> cluster : candidates)
214 cluster.removeAll(new_cluster);
215 clusters.add(new_cluster);
216 remaining.removeAll(new_cluster);
218 catch (NotEnoughClustersException nece)
220 // all remaining vertices are in the same cluster
225 // identify remaining vertices (if any) as a 'garbage' cluster
226 if (!remaining.isEmpty())
227 clusters.add(remaining);
233 * Do k-means with three intervals and pick the
234 * smaller two clusters (presumed to be on the ends); this is closer to the Wu-Huberman method.
236 * @param voltage_ranks
238 protected void addTwoCandidateClusters(LinkedList<Set<V>> candidates,
239 Map<V, double[]> voltage_ranks)
243 List<Map<V, double[]>> clusters = new ArrayList<Map<V, double[]>>(kmc.cluster(voltage_ranks, 3));
244 boolean b01 = clusters.get(0).size() > clusters.get(1).size();
245 boolean b02 = clusters.get(0).size() > clusters.get(2).size();
246 boolean b12 = clusters.get(1).size() > clusters.get(2).size();
249 candidates.add(clusters.get(1).keySet());
250 candidates.add(clusters.get(2).keySet());
252 else if (!b01 && b12)
254 candidates.add(clusters.get(0).keySet());
255 candidates.add(clusters.get(2).keySet());
257 else if (!b02 && !b12)
259 candidates.add(clusters.get(0).keySet());
260 candidates.add(clusters.get(1).keySet());
263 catch (NotEnoughClustersException e)
265 // no valid candidates, continue
270 * alternative to addTwoCandidateClusters(): cluster vertices by voltages into 2 clusters.
271 * We only consider the smaller of the two clusters returned
272 * by k-means to be a 'true' cluster candidate; the other is a garbage cluster.
274 * @param voltage_ranks
276 protected void addOneCandidateCluster(LinkedList<Set<V>> candidates,
277 Map<V, double[]> voltage_ranks)
281 List<Map<V, double[]>> clusters;
282 clusters = new ArrayList<Map<V, double[]>>(kmc.cluster(voltage_ranks, 2));
283 if (clusters.get(0).size() < clusters.get(1).size())
284 candidates.add(clusters.get(0).keySet());
286 candidates.add(clusters.get(1).keySet());
288 catch (NotEnoughClustersException e)
290 // no valid candidates, continue
295 * Returns an array of cluster seeds, ranked in decreasing order
296 * of number of appearances in the specified collection of candidate
300 protected List<V> getSeedCandidates(Collection<Set<V>> candidates)
302 final Map<V, double[]> occur_counts = getObjectCounts(candidates, null);
304 ArrayList<V> occurrences = new ArrayList<V>(occur_counts.keySet());
305 Collections.sort(occurrences, new MapValueArrayComparator(occur_counts));
307 System.out.println("occurrences: ");
308 for (int i = 0; i < occurrences.size(); i++)
309 System.out.println(occur_counts.get(occurrences.get(i))[0]);
314 protected Map<V, double[]> getObjectCounts(Collection<Set<V>> candidates, V seed)
316 Map<V, double[]> occur_counts = new HashMap<V, double[]>();
317 for (V v : g.getVertices())
318 occur_counts.put(v, new double[]{0});
320 for (Set<V> candidate : candidates)
323 System.out.println(candidate.size());
324 if (seed == null || candidate.contains(seed))
326 for (V element : candidate)
328 double[] count = occur_counts.get(element);
336 System.out.println("occur_counts size: " + occur_counts.size());
337 for (V v : occur_counts.keySet())
338 System.out.println(occur_counts.get(v)[0]);
344 protected class MapValueArrayComparator implements Comparator<V>
346 private Map<V, double[]> map;
348 protected MapValueArrayComparator(Map<V, double[]> map)
353 public int compare(V o1, V o2)
355 double[] count0 = map.get(o1);
356 double[] count1 = map.get(o2);
357 if (count0[0] < count1[0])
359 else if (count0[0] > count1[0])