So I did a small project in order to understand how Apache Mahout works. I decided to use Apache Maven 2 to manage all the dependencies, so I will start with the POM file first.
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>org.acme</groupId>
    <artifactId>mahout</artifactId>
    <version>0.94</version>
    <name>Mahout Examples</name>
    <description>Scalable machine learning library examples</description>
    <packaging>jar</packaging>

    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <apache.mahout.version>0.4</apache.mahout.version>
    </properties>

    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <configuration>
                    <encoding>UTF-8</encoding>
                    <source>1.6</source>
                    <target>1.6</target>
                    <optimize>true</optimize>
                </configuration>
            </plugin>
        </plugins>
    </build>

    <dependencies>
        <dependency>
            <groupId>org.apache.mahout</groupId>
            <artifactId>mahout-core</artifactId>
            <version>${apache.mahout.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.mahout</groupId>
            <artifactId>mahout-math</artifactId>
            <version>${apache.mahout.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.mahout</groupId>
            <artifactId>mahout-utils</artifactId>
            <version>${apache.mahout.version}</version>
        </dependency>
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-api</artifactId>
            <version>1.6.0</version>
        </dependency>
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-jcl</artifactId>
            <version>1.6.0</version>
        </dependency>
    </dependencies>
</project>
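With this POM in place, nothing else is needed to build the project; a plain Maven invocation fetches the Mahout and SLF4J artifacts (and their Hadoop transitive dependencies) and produces the JAR:

mvn clean install

Next, the class that does the actual work: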
package org.acme;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.List;

import org.apache.hadoop.fs.Path;
import org.apache.mahout.classifier.ClassifierResult;
import org.apache.mahout.classifier.bayes.TrainClassifier;
import org.apache.mahout.classifier.bayes.algorithm.BayesAlgorithm;
import org.apache.mahout.classifier.bayes.common.BayesParameters;
import org.apache.mahout.classifier.bayes.datastore.InMemoryBayesDatastore;
import org.apache.mahout.classifier.bayes.exceptions.InvalidDatastoreException;
import org.apache.mahout.classifier.bayes.interfaces.Algorithm;
import org.apache.mahout.classifier.bayes.interfaces.Datastore;
import org.apache.mahout.classifier.bayes.model.ClassifierContext;
import org.apache.mahout.common.nlp.NGrams;

public class Starter {
    public static void main( final String[] args ) {
        // Parameters shared by the trainer and the classifier
        final BayesParameters params = new BayesParameters();
        params.setGramSize( 1 );
        params.set( "verbose", "true" );
        params.set( "classifierType", "bayes" );
        params.set( "defaultCat", "OTHER" );
        params.set( "encoding", "UTF-8" );
        params.set( "alpha_i", "1.0" );
        params.set( "dataSource", "hdfs" );
        params.set( "basePath", "/tmp/output" );

        try {
            // Train the model on the files in /tmp/input, writing it to /tmp/output
            Path input = new Path( "/tmp/input" );
            TrainClassifier.trainNaiveBayes( input, "/tmp/output", params );

            // Load the trained model into an in-memory datastore
            Algorithm algorithm = new BayesAlgorithm();
            Datastore datastore = new InMemoryBayesDatastore( params );
            ClassifierContext classifier = new ClassifierContext( algorithm, datastore );
            classifier.initialize();

            // Classify each line of the file passed as the first argument
            final BufferedReader reader = new BufferedReader( new FileReader( args[ 0 ] ) );
            String entry = reader.readLine();
            while( entry != null ) {
                List< String > document = new NGrams( entry,
                    Integer.parseInt( params.get( "gramSize" ) ) )
                    .generateNGramsWithoutLabel();

                ClassifierResult result = classifier.classifyDocument(
                    document.toArray( new String[ document.size() ] ),
                    params.get( "defaultCat" ) );

                // Print the detected category label followed by the original text
                System.out.println( result.getLabel() + " " + entry );
                entry = reader.readLine();
            }
            reader.close();
        } catch( final IOException ex ) {
            ex.printStackTrace();
        } catch( final InvalidDatastoreException ex ) {
            ex.printStackTrace();
        }
    }
}
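The program expects the path of the file to classify as its first argument (args[0]); every line of that file is treated as a separate document. One possible way to run it, assuming the exec-maven-plugin is available (it is not declared in the POM above) and Hadoop is configured, would be:

mvn exec:java -Dexec.mainClass="org.acme.Starter" -Dexec.args="/path/to/documents.txt"

Here /path/to/documents.txt is just a placeholder for your own file.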
Each line of a training file starts with the category label, followed by a tab and the document text, for example:

SUGGESTION	That's a great suggestion
QUESTION	Do you sell Microsoft Office?

The more files you provide, the more precise a classification you will get. All files must be put into the '/tmp/input' folder; they will be processed by Apache Hadoop first. :)
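For illustration only (the file name below is made up, and any name works, since Hadoop picks up every file in the input directory), the layout could look like:

/tmp/input/
    train.txt      <- one 'LABEL<TAB>document text' line per training document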
4 comments:
Hi, interesting experiment, but what is args[0] in your code?
Also, how is it to be run? The dependencies are installed using mvn install.
Hello,
do you have any example using Apache Mahout? Thanks a lot.
Please, an example of how to use Mahout.