1 /** 2 * Copyright (c) 2010 Yahoo! Inc. All rights reserved. 3 * 4 * http://www.apache.org/licenses/LICENSE-2.0 5 * 6 * Unless required by applicable law or agreed to in writing, software 7 * distributed under the License is distributed on an "AS IS" BASIS, 8 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 9 * See the License for the specific language governing permissions and 10 * limitations under the License. 11 */ 12 package org.apache.omid.benchmarks.utils; 13 14 import java.util.Random; 15 16 /** 17 * A generator of a zipfian distribution. It produces a sequence of items, such that some items are more popular than others, according 18 * to a zipfian distribution. When you construct an instance of this class, you specify the number of items in the set to draw from, either 19 * by specifying an itemcount (so that the sequence is of items from 0 to itemcount-1) or by specifying a min and a max (so that the sequence is of 20 * items from min to max inclusive). After you construct the instance, you can change the number of items by calling nextInt(itemcount) or nextLong(itemcount). 21 * 22 * Note that the popular items will be clustered together, e.g. item 0 is the most popular, item 1 the second most popular, and so on (or min is the most 23 * popular, min+1 the next most popular, etc.) If you don't want this clustering, and instead want the popular items scattered throughout the 24 * item space, then use ScrambledZipfianGenerator instead. 25 * 26 * Be aware: initializing this generator may take a long time if there are lots of items to choose from (e.g. over a minute 27 * for 100 million objects). This is because certain mathematical values need to be computed to properly generate a zipfian skew, and one of those 28 * values (zeta) is a sum sequence from 1 to n, where n is the itemcount. Note that if you increase the number of items in the set, we can compute 29 * a new zeta incrementally, so it should be fast unless you have added millions of items. However, if you decrease the number of items, we recompute 30 * zeta from scratch, so this can take a long time. 31 * 32 * The algorithm used here is from "Quickly Generating Billion-Record Synthetic Databases", Jim Gray et al, SIGMOD 1994. 33 */ 34 public class ZipfianGenerator extends IntegerGenerator { 35 public static final double ZIPFIAN_CONSTANT = 0.99; 36 37 /** 38 * Number of items. 39 */ 40 long items; 41 42 /** 43 * Min item to generate. 44 */ 45 long base; 46 47 /** 48 * The zipfian constant to use. 49 */ 50 double zipfianconstant; 51 52 /** 53 * Computed parameters for generating the distribution. 54 */ 55 double alpha, zetan, eta, theta, zeta2theta; 56 57 /** 58 * The number of items used to compute zetan the last time. 59 */ 60 long countforzeta; 61 62 /** 63 * Flag to prevent problems. If you increase the number of items the zipfian generator is allowed to choose from, this code will incrementally compute a new zeta 64 * value for the larger itemcount. However, if you decrease the number of items, the code computes zeta from scratch; this is expensive for large itemsets. 65 * Usually this is not intentional; e.g. one thread thinks the number of items is 1001 and calls "nextLong()" with that item count; then another thread who thinks the 66 * number of items is 1000 calls nextLong() with itemcount=1000 triggering the expensive recomputation. (It is expensive for 100 million items, not really for 1000 items.) Why 67 * did the second thread think there were only 1000 items? maybe it read the item count before the first thread incremented it. So this flag allows you to say if you really do 68 * want that recomputation. If true, then the code will recompute zeta if the itemcount goes down. If false, the code will assume itemcount only goes up, and never recompute. 69 */ 70 boolean allowitemcountdecrease = false; 71 72 Random random = new Random(); 73 74 /******************************* Constructors **************************************/ 75 76 /** 77 * Create a zipfian generator for the specified number of items. 78 * @param _items The number of items in the distribution. 79 */ 80 public ZipfianGenerator(long _items) { 81 this(0, _items - 1); 82 } 83 84 /** 85 * Create a zipfian generator for items between min and max. 86 * @param _min The smallest integer to generate in the sequence. 87 * @param _max The largest integer to generate in the sequence. 88 */ 89 public ZipfianGenerator(long _min, long _max) { 90 this(_min, _max, ZIPFIAN_CONSTANT); 91 } 92 93 /** 94 * Create a zipfian generator for the specified number of items using the specified zipfian constant. 95 * 96 * @param _items The number of items in the distribution. 97 * @param _zipfianconstant The zipfian constant to use. 98 */ 99 public ZipfianGenerator(long _items, double _zipfianconstant) { 100 this(0, _items - 1, _zipfianconstant); 101 } 102 103 /** 104 * Create a zipfian generator for items between min and max (inclusive) for the specified zipfian constant. 105 * @param min The smallest integer to generate in the sequence. 106 * @param max The largest integer to generate in the sequence. 107 * @param _zipfianconstant The zipfian constant to use. 108 */ 109 public ZipfianGenerator(long min, long max, double _zipfianconstant) { 110 this(min, max, _zipfianconstant, zetastatic(max - min + 1, _zipfianconstant)); 111 } 112 113 /** 114 * Create a zipfian generator for items between min and max (inclusive) for the specified zipfian constant, using the precomputed value of zeta. 115 * 116 * @param min The smallest integer to generate in the sequence. 117 * @param max The largest integer to generate in the sequence. 118 * @param _zipfianconstant The zipfian constant to use. 119 * @param _zetan The precomputed zeta constant. 120 */ 121 public ZipfianGenerator(long min, long max, double _zipfianconstant, double _zetan) { 122 123 items = max - min + 1; 124 base = min; 125 zipfianconstant = _zipfianconstant; 126 127 theta = zipfianconstant; 128 129 zeta2theta = zeta(2, theta); 130 131 132 alpha = 1.0 / (1.0 - theta); 133 //zetan=zeta(items,theta); 134 zetan = _zetan; 135 countforzeta = items; 136 eta = (1 - Math.pow(2.0 / items, 1 - theta)) / (1 - zeta2theta / zetan); 137 138 //System.out.println("XXXX 3 XXXX"); 139 nextInt(); 140 //System.out.println("XXXX 4 XXXX"); 141 } 142 143 /**************************************************************************/ 144 145 /** 146 * Compute the zeta constant needed for the distribution. Do this from scratch for a distribution with n items, using the 147 * zipfian constant theta. Remember the value of n, so if we change the itemcount, we can recompute zeta. 148 * 149 * @param n The number of items to compute zeta over. 150 * @param theta The zipfian constant. 151 */ 152 double zeta(long n, double theta) { 153 countforzeta = n; 154 return zetastatic(n, theta); 155 } 156 157 /** 158 * Compute the zeta constant needed for the distribution. Do this from scratch for a distribution with n items, using the 159 * zipfian constant theta. This is a static version of the function which will not remember n. 160 * @param n The number of items to compute zeta over. 161 * @param theta The zipfian constant. 162 */ 163 static double zetastatic(long n, double theta) { 164 return zetastatic(0, n, theta, 0); 165 } 166 167 /** 168 * Compute the zeta constant needed for the distribution. Do this incrementally for a distribution that 169 * has n items now but used to have st items. Use the zipfian constant theta. Remember the new value of 170 * n so that if we change the itemcount, we'll know to recompute zeta. 171 * 172 * @param st The number of items used to compute the last initialsum 173 * @param n The number of items to compute zeta over. 174 * @param theta The zipfian constant. 175 * @param initialsum The value of zeta we are computing incrementally from. 176 */ 177 double zeta(long st, long n, double theta, double initialsum) { 178 countforzeta = n; 179 return zetastatic(st, n, theta, initialsum); 180 } 181 182 /** 183 * Compute the zeta constant needed for the distribution. Do this incrementally for a distribution that 184 * has n items now but used to have st items. Use the zipfian constant theta. Remember the new value of 185 * n so that if we change the itemcount, we'll know to recompute zeta. 186 * @param st The number of items used to compute the last initialsum 187 * @param n The number of items to compute zeta over. 188 * @param theta The zipfian constant. 189 * @param initialsum The value of zeta we are computing incrementally from. 190 */ 191 static double zetastatic(long st, long n, double theta, double initialsum) { 192 double sum = initialsum; 193 for (long i = st; i < n; i++) { 194 195 sum += 1 / (Math.pow(i + 1, theta)); 196 } 197 198 //System.out.println("countforzeta="+countforzeta); 199 200 return sum; 201 } 202 203 /****************************************************************************************/ 204 205 /** 206 * Generate the next item. this distribution will be skewed toward lower integers; e.g. 0 will 207 * be the most popular, 1 the next most popular, etc. 208 * @param itemcount The number of items in the distribution. 209 * @return The next item in the sequence. 210 */ 211 public int nextInt(int itemcount) { 212 return (int) nextLong(itemcount); 213 } 214 215 /** 216 * Generate the next item as a long. 217 * 218 * @param itemcount The number of items in the distribution. 219 * @return The next item in the sequence. 220 */ 221 public long nextLong(long itemcount) { 222 //from "Quickly Generating Billion-Record Synthetic Databases", Jim Gray et al, SIGMOD 1994 223 224 if (itemcount != countforzeta) { 225 226 //have to recompute zetan and eta, since they depend on itemcount 227 synchronized (this) { 228 if (itemcount > countforzeta) { 229 //System.err.println("WARNING: Incrementally recomputing Zipfian distribtion. (itemcount="+itemcount+" countforzeta="+countforzeta+")"); 230 231 //we have added more items. can compute zetan incrementally, which is cheaper 232 zetan = zeta(countforzeta, itemcount, theta, zetan); 233 eta = (1 - Math.pow(2.0 / items, 1 - theta)) / (1 - zeta2theta / zetan); 234 } else if ((itemcount < countforzeta) && (allowitemcountdecrease)) { 235 //have to start over with zetan 236 //note : for large itemsets, this is very slow. so don't do it! 237 238 //TODO: can also have a negative incremental computation, e.g. if you decrease the number of items, then just subtract 239 //the zeta sequence terms for the items that went away. This would be faster than recomputing from scratch when the number of items 240 //decreases 241 242 System.err.println("WARNING: Recomputing Zipfian distribtion. This is slow and should be avoided. (itemcount=" + itemcount + " countforzeta=" + countforzeta + ")"); 243 244 zetan = zeta(itemcount, theta); 245 eta = (1 - Math.pow(2.0 / items, 1 - theta)) / (1 - zeta2theta / zetan); 246 } 247 } 248 } 249 250 double u = random.nextDouble(); 251 double uz = u * zetan; 252 253 if (uz < 1.0) { 254 return 0; 255 } 256 257 if (uz < 1.0 + Math.pow(0.5, theta)) { 258 return 1; 259 } 260 261 long ret = base + (long) ((itemcount) * Math.pow(eta * u - eta + 1, alpha)); 262 setLastInt((int) ret); 263 return ret; 264 } 265 266 /** 267 * Return the next value, skewed by the Zipfian distribution. The 0th item will be the most popular, followed by the 1st, followed 268 * by the 2nd, etc. (Or, if min != 0, the min-th item is the most popular, the min+1th item the next most popular, etc.) If you want the 269 * popular items scattered throughout the item space, use ScrambledZipfianGenerator instead. 270 */ 271 @Override 272 public int nextInt() { 273 return (int) nextLong(items); 274 } 275 276 /** 277 * @return the next value, skewed by the Zipfian distribution. The 0th item will be the most popular, followed by the 1st, followed 278 * by the 2nd, etc. (Or, if min != 0, the min-th item is the most popular, the min+1th item the next most popular, etc.) If you want the 279 * popular items scattered throughout the item space, use ScrambledZipfianGenerator instead. 280 */ 281 public long nextLong() { 282 return nextLong(items); 283 } 284 285 //TODO Implement ZipfianGenerator.mean() 286 @Override 287 public double mean() { 288 throw new UnsupportedOperationException("@todo implement ZipfianGenerator.mean()"); 289 } 290 }