News Clustering with the Fuzzy K-Means Algorithm
Source: http://blog.csdn.net/u012965373/article/details/50754449
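The program below strings together the standard Mahout text-clustering pipeline: it tokenizes the news documents with a Lucene analyzer, builds term-frequency vectors from the tokens, converts them to TF-IDF vectors, seeds the initial centroids with canopy clustering (Manhattan distance), and then runs fuzzy k-means (Tanimoto distance) before printing each document's cluster assignment.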
/**
 * @author YangXin
 * @info News clustering with the fuzzy K-Means algorithm
 */
package unitNine;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.lucene.analysis.Analyzer;
import org.apache.mahout.clustering.Cluster;
import org.apache.mahout.clustering.canopy.CanopyDriver;
import org.apache.mahout.clustering.classify.WeightedVectorWritable;
import org.apache.mahout.clustering.fuzzykmeans.FuzzyKMeansDriver;
import org.apache.mahout.common.HadoopUtil;
import org.apache.mahout.common.distance.ManhattanDistanceMeasure;
import org.apache.mahout.common.distance.TanimotoDistanceMeasure;
import org.apache.mahout.vectorizer.DictionaryVectorizer;
import org.apache.mahout.vectorizer.DocumentProcessor;
import org.apache.mahout.vectorizer.tfidf.TFIDFConverter;

public class NewsFuzzyKMeansClustering {

  public static void main(String[] args) throws Exception {
    // Vectorization parameters.
    int minSupport = 5;        // minimum term frequency across the corpus
    int minDf = 10;            // minimum document frequency
    int maxDFPercent = 70;     // drop terms appearing in more than 70% of documents
    int maxNGramSize = 1;      // unigrams only
    int minLLRValue = 200;     // log-likelihood ratio threshold for n-grams
    int reduceTasks = 1;
    int chunkSize = 200;       // dictionary chunk size in MB
    int norm = 2;              // L2 normalization
    boolean sequentialAccessOutput = true;

    String inputDir = "inputDir";
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);

    /* The input is expected to be a SequenceFile of <Text, Text> pairs
       (document ID -> document contents). Database and Document below are
       placeholders for whatever store supplies the news articles:
    SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf,
        new Path(inputDir, "documents.seq"), Text.class, Text.class);
    for (Document d : Database) {
      writer.append(new Text(d.getID()), new Text(d.contents()));
    }
    writer.close();
    */

    String outputDir = "newsClusters";
    HadoopUtil.delete(conf, new Path(outputDir));

    // Step 1: tokenize the documents with a custom Lucene analyzer.
    Path tokenizedPath = new Path(outputDir,
        DocumentProcessor.TOKENIZED_DOCUMENT_OUTPUT_FOLDER);
    MyAnalyzer analyzer = new MyAnalyzer();
    DocumentProcessor.tokenizeDocuments(new Path(inputDir),
        analyzer.getClass().asSubclass(Analyzer.class), tokenizedPath, conf);

    // Step 2: build term-frequency vectors from the tokenized documents.
    DictionaryVectorizer.createTermFrequencyVectors(tokenizedPath,
        new Path(outputDir), conf, minSupport, maxNGramSize, minLLRValue,
        2, true, reduceTasks, chunkSize, sequentialAccessOutput, false);

    // Step 3: convert term-frequency vectors to TF-IDF vectors.
    TFIDFConverter.processTfIdf(
        new Path(outputDir, DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER),
        new Path(outputDir), conf, chunkSize, minDf, maxDFPercent, norm,
        true, sequentialAccessOutput, false, reduceTasks);

    String vectorsFolder = outputDir + "/tfidf-vectors";
    String canopyCentroids = outputDir + "/canopy-centroids";
    String clusterOutput = outputDir + "/clusters/";

    // Step 4: seed the initial centroids with canopy clustering
    // (Manhattan distance, T1 = 3000, T2 = 2000).
    CanopyDriver.run(conf, new Path(vectorsFolder), new Path(canopyCentroids),
        new ManhattanDistanceMeasure(), 3000.0, 2000.0, false, false);

    // Step 5: run fuzzy k-means from the canopy centroids
    // (Tanimoto distance, fuzziness factor m = 2.0, up to 20 iterations).
    FuzzyKMeansDriver.run(conf, new Path(vectorsFolder),
        new Path(canopyCentroids, "clusters-0"), new Path(clusterOutput),
        new TanimotoDistanceMeasure(), 0.01, 20, 2.0f, true, true, 0.0, false);

    // Step 6: read back the clustered points and print each assignment.
    SequenceFile.Reader reader = new SequenceFile.Reader(fs,
        new Path(clusterOutput + Cluster.CLUSTERED_POINTS_DIR + "/part-m-00000"),
        conf);
    IntWritable key = new IntWritable();
    WeightedVectorWritable value = new WeightedVectorWritable();
    while (reader.next(key, value)) {
      System.out.println("Cluster: " + key.toString() + " "
          + value.getVector().asFormatString());
    }
    reader.close();
  }
}
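The listing references MyAnalyzer, a custom Lucene Analyzer that is not shown in the original post. A minimal sketch of what such an analyzer might look like follows, assuming a Lucene 3.x API (the generation Mahout's vectorizer classes of this era compiled against); the class name is taken from the listing, but the filter chain here (standard tokenization, lowercasing, English stop-word removal) is an assumption, not part of the original:

package unitNine;

import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopAnalyzer;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.util.Version;

// Hypothetical implementation of the MyAnalyzer referenced above.
public class MyAnalyzer extends Analyzer {
  @Override
  public TokenStream tokenStream(String fieldName, Reader reader) {
    // Standard tokenization, then lowercase and drop English stop words.
    TokenStream result = new StandardTokenizer(Version.LUCENE_36, reader);
    result = new StandardFilter(Version.LUCENE_36, result);
    result = new LowerCaseFilter(Version.LUCENE_36, result);
    result = new StopFilter(Version.LUCENE_36, result,
        StopAnalyzer.ENGLISH_STOP_WORDS_SET);
    return result;
  }
}

Note that DocumentProcessor.tokenizeDocuments instantiates the analyzer class reflectively, so whatever analyzer is used must have a public zero-argument constructor, as this sketch does.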