Welcome

首页 / 软件开发 / 数据结构与算法 / mahout系列:minhash聚类

mahout系列:minhash聚类 | 2014-04-24 | csdn博客 | yueyedeai
Map:Vector featureVector = features.get();if (featureVector.size() < minVectorSize) {return;}// Initialize the MinHash values to highestfor (int i = 0; i < numHashFunctions; i++) {minHashValues[i] = Integer.MAX_VALUE;}for (int i = 0; i < numHashFunctions; i++) {for (Vector.Element ele : featureVector.nonZeroes()) {int value = hashValue ? (int) ele.get() : ele.index();bytesToHash[0] = (byte) (value >> 24);bytesToHash[1] = (byte) (value >> 16);bytesToHash[2] = (byte) (value >> 8);bytesToHash[3] = (byte) value;int hashIndex = hashFunction[i].hash(bytesToHash);//if our new hash value is less than the old one, replace the old oneif (minHashValues[i] > hashIndex) {minHashValues[i] = hashIndex;}}}// output the cluster informationfor (int i = 0; i < numHashFunctions; i++) {StringBuilder clusterIdBuilder = new StringBuilder();for (int j = 0; j < keyGroups; j++) {clusterIdBuilder.append(minHashValues[(i + j) % numHashFunctions]).append("-");}//remove the last dashclusterIdBuilder.deleteCharAt(clusterIdBuilder.length() - 1);cluster.set(clusterIdBuilder.toString());if (debugOutput) {vector.set(featureVector);context.write(cluster, vector);} else {context.write(cluster, item);}}
/**
 * Collects every point received for one cluster key and writes them back out
 * only when the cluster holds at least {@code minClusterSize} points.
 *
 * <p>Increments the {@code Clusters.ACCEPTED} counter for a cluster that is
 * written and {@code Clusters.DISCARDED} for one that is dropped.
 *
 * @param cluster the cluster key the points were grouped under
 * @param points the points emitted for this cluster
 * @param context the job context used for counters and output
 * @throws IOException if writing to the context fails
 * @throws InterruptedException if the write is interrupted
 */
protected void reduce(Text cluster, Iterable<Writable> points, Context context)
    throws IOException, InterruptedException {
  Collection<Writable> members = Lists.newArrayList();
  for (Writable point : points) {
    // Each point is copied before buffering: a cloned vector in debug mode,
    // otherwise a Text built from its string form.
    // NOTE(review): presumably needed because the framework reuses the
    // Writable instance between iterations - confirm.
    Writable copy = debugOutput
        ? new VectorWritable(((VectorWritable) point).get().clone())
        : new Text(point.toString());
    members.add(copy);
  }

  if (members.size() < minClusterSize) {
    // Too few points: count the cluster as discarded and write nothing.
    context.getCounter(Clusters.DISCARDED).increment(1);
    return;
  }

  context.getCounter(Clusters.ACCEPTED).increment(1);
  for (Writable member : members) {
    context.write(cluster, member);
  }
}