Clustering News with Canopy Generation and k-means Clustering

From: http://blog.csdn.net/u012965373/article/details/50754420


/**
 * @author YangXin
 * @info Clustering news with canopy generation and k-means clustering
 */
package unitNine;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.lucene.analysis.Analyzer;
import org.apache.mahout.common.HadoopUtil;
import org.apache.mahout.math.VectorWritable;
import org.apache.mahout.vectorizer.DictionaryVectorizer;
import org.apache.mahout.vectorizer.DocumentProcessor;
import org.apache.mahout.vectorizer.tfidf.TFIDFConverter;

public class ReutersToSparseVectors {

  public static void main(String[] args) throws Exception {
    // Vectorization parameters
    int minSupport = 5;                    // a term must occur at least this often overall
    int minDf = 5;                         // minimum document frequency
    int maxDFPercent = 95;                 // drop terms present in more than 95% of documents
    int maxNGramSize = 1;                  // unigrams only
    float minLLRValue = 50;                // log-likelihood ratio threshold for n-gram selection
    int reduceTasks = 1;
    int chunkSize = 200;                   // dictionary chunk size, in MB
    int norm = 2;                          // L2 norm for the TF-IDF vectors
    boolean sequentialAccessOutput = true;

    String inputDir = "inputDir";

    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);

    String outputDir = "reuters";
    HadoopUtil.delete(conf, new Path(outputDir));
    Path tokenizedPath = new Path(outputDir,
        DocumentProcessor.TOKENIZED_DOCUMENT_OUTPUT_FOLDER);

    // Tokenize the input documents. MyAnalyzer is a custom Lucene Analyzer
    // that the original post does not show.
    MyAnalyzer analyzer = new MyAnalyzer();
    DocumentProcessor.tokenizeDocuments(new Path(inputDir), analyzer.getClass()
        .asSubclass(Analyzer.class), tokenizedPath, conf);

    // Build term-frequency vectors from the tokenized documents
    // (2 = norm power, true = log-normalize)
    DictionaryVectorizer.createTermFrequencyVectors(tokenizedPath,
        new Path(outputDir), conf, minSupport, maxNGramSize, minLLRValue,
        2, true, reduceTasks, chunkSize, sequentialAccessOutput, false);

    // Convert the TF vectors into TF-IDF vectors
    TFIDFConverter.processTfIdf(
        new Path(outputDir, DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER),
        new Path(outputDir), conf, chunkSize, minDf,
        maxDFPercent, norm, true, sequentialAccessOutput, false, reduceTasks);

    // Read back and print the resulting sparse TF-IDF vectors
    String vectorsFolder = outputDir + "/tfidf-vectors";
    SequenceFile.Reader reader = new SequenceFile.Reader(fs,
        new Path(vectorsFolder, "part-r-00000"), conf);
    Text key = new Text();
    VectorWritable value = new VectorWritable();
    while (reader.next(key, value)) {
      System.out.println(key.toString() + " => "
          + value.get().asFormatString());
    }
    reader.close();
  }
}
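The listing above stops after the TF-IDF vectors are written; the canopy and k-means steps named in the title are not shown in the post. Below is a minimal sketch of that missing clustering stage, assuming the Mahout 0.5-era driver signatures (CanopyDriver.run and KMeansDriver.run changed in later releases); the class name, the paths, the T1/T2 thresholds (250/120), the convergence delta, and the distance measures are illustrative choices, not values from the original article.

package unitNine;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.mahout.clustering.canopy.CanopyDriver;
import org.apache.mahout.clustering.kmeans.KMeansDriver;
import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
import org.apache.mahout.common.distance.TanimotoDistanceMeasure;

public class NewsClusteringSketch {

  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();

    // TF-IDF vectors produced by ReutersToSparseVectors above
    Path vectors = new Path("reuters/tfidf-vectors");
    Path canopyCentroids = new Path("reuters/canopy-centroids");
    Path clusterOutput = new Path("reuters/clusters");

    // Canopy generation: the T1 > T2 distance thresholds are data-dependent
    // and need tuning; runClustering=false writes only the centroids,
    // runSequential=false runs the step as a MapReduce job.
    CanopyDriver.run(conf, vectors, canopyCentroids,
        new EuclideanDistanceMeasure(), 250, 120, false, false);

    // k-means seeded with the canopy centroids: converge at delta 0.01,
    // run at most 20 iterations, then classify the points (runClustering=true).
    KMeansDriver.run(conf, vectors, new Path(canopyCentroids, "clusters-0"),
        clusterOutput, new TanimotoDistanceMeasure(), 0.01, 20, true, false);
  }
}

The resulting clusters could then be inspected with Mahout's clusterdump tool against the dictionary file produced during vectorization; treat the thresholds and distance measures above as starting points to tune rather than fixed values.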
