So I did a small project in order to understand how Apache Mahout works. I decided to use Apache Maven 2 to manage all the dependencies, so I will start with the POM file first.
```xml
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>org.acme</groupId>
    <artifactId>mahout</artifactId>
    <version>0.94</version>
    <name>Mahout Examples</name>
    <description>Scalable machine learning library examples</description>
    <packaging>jar</packaging>

    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <apache.mahout.version>0.4</apache.mahout.version>
    </properties>

    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <configuration>
                    <encoding>UTF-8</encoding>
                    <source>1.6</source>
                    <target>1.6</target>
                    <optimize>true</optimize>
                </configuration>
            </plugin>
        </plugins>
    </build>

    <dependencies>
        <dependency>
            <groupId>org.apache.mahout</groupId>
            <artifactId>mahout-core</artifactId>
            <version>${apache.mahout.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.mahout</groupId>
            <artifactId>mahout-math</artifactId>
            <version>${apache.mahout.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.mahout</groupId>
            <artifactId>mahout-utils</artifactId>
            <version>${apache.mahout.version}</version>
        </dependency>
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-api</artifactId>
            <version>1.6.0</version>
        </dependency>
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-jcl</artifactId>
            <version>1.6.0</version>
        </dependency>
    </dependencies>
</project>
```
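With this POM in place, building the example is just a regular Maven 2 invocation; a plain `mvn clean package` is enough to fetch the Mahout 0.4 artifacts and compile everything:

```
mvn clean package
```

Next comes the classifier itself: a single class that trains a naive Bayes model and then classifies documents read from a file.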
```java
package org.acme;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.List;

import org.apache.hadoop.fs.Path;
import org.apache.mahout.classifier.ClassifierResult;
import org.apache.mahout.classifier.bayes.TrainClassifier;
import org.apache.mahout.classifier.bayes.algorithm.BayesAlgorithm;
import org.apache.mahout.classifier.bayes.common.BayesParameters;
import org.apache.mahout.classifier.bayes.datastore.InMemoryBayesDatastore;
import org.apache.mahout.classifier.bayes.exceptions.InvalidDatastoreException;
import org.apache.mahout.classifier.bayes.interfaces.Algorithm;
import org.apache.mahout.classifier.bayes.interfaces.Datastore;
import org.apache.mahout.classifier.bayes.model.ClassifierContext;
import org.apache.mahout.common.nlp.NGrams;

public class Starter {
    public static void main( final String[] args ) {
        // Training parameters: unigrams, plain Bayes, model written to /tmp/output on HDFS
        final BayesParameters params = new BayesParameters();
        params.setGramSize( 1 );
        params.set( "verbose", "true" );
        params.set( "classifierType", "bayes" );
        params.set( "defaultCat", "OTHER" );
        params.set( "encoding", "UTF-8" );
        params.set( "alpha_i", "1.0" );
        params.set( "dataSource", "hdfs" );
        params.set( "basePath", "/tmp/output" );

        try {
            // Train the naive Bayes model from the documents under /tmp/input
            final Path input = new Path( "/tmp/input" );
            TrainClassifier.trainNaiveBayes( input, "/tmp/output", params );

            // Load the trained model into memory and wrap it in a classifier context
            final Algorithm algorithm = new BayesAlgorithm();
            final Datastore datastore = new InMemoryBayesDatastore( params );
            final ClassifierContext classifier = new ClassifierContext( algorithm, datastore );
            classifier.initialize();

            // Classify the file passed as the first argument, line by line
            final BufferedReader reader = new BufferedReader( new FileReader( args[ 0 ] ) );
            String entry = reader.readLine();
            while( entry != null ) {
                final List< String > document = new NGrams( entry,
                    Integer.parseInt( params.get( "gramSize" ) ) )
                    .generateNGramsWithoutLabel();

                final ClassifierResult result = classifier.classifyDocument(
                    document.toArray( new String[ document.size() ] ),
                    params.get( "defaultCat" ) );

                // Print the predicted category next to the original line
                System.out.println( result.getLabel() + "\t" + entry );
                entry = reader.readLine();
            }
            reader.close();
        } catch( final IOException ex ) {
            ex.printStackTrace();
        } catch( final InvalidDatastoreException ex ) {
            ex.printStackTrace();
        }
    }
}
```
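To try it out, the Mahout and Hadoop dependencies have to be on the classpath; one convenient way is the exec-maven-plugin (a sketch, not the only option, and `/tmp/test.txt` is a hypothetical file of unlabeled lines to classify):

```
mvn exec:java -Dexec.mainClass="org.acme.Starter" -Dexec.args="/tmp/test.txt"
```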
Running the classifier against such a file prints the predicted category next to each line:

```
SUGGESTION	That's a great suggestion
QUESTION	Do you sell Microsoft Office?
...
```

The more files you provide, the more precise a classification you will get. All files must be put into the '/tmp/input' folder; they will be processed by Apache Hadoop first. :)
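For reference, the training files under '/tmp/input' follow the format Mahout 0.4's Bayes trainer expects (the same layout Mahout's BayesFileFormatter produces): one document per line, with the category label first, then a tab, then the document text. The lines below are made-up samples for the categories used above:

```
SUGGESTION	you should add an export to PDF feature
QUESTION	do you sell Microsoft Office licenses
OTHER	good morning everyone
```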