Hi there,
Our project is to implement the KMeans algorithm on Hadoop. We installed Hadoop on Windows and ran the WordCount code with Hadoop and also with Eclipse, but we have a problem with giving input to the KMeans algorithm.
Could you help us?
This is our KMeans code:
Java
import java.io.IOException;
import java.util.*;
import java.io.*;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.mapred.Reducer;
@SuppressWarnings("deprecation")
public class KMeans {
    public static String OUT = "outfile";
    public static String IN = "inputlarger";
    public static String CENTROID_FILE_NAME = "/centroid.txt";
    public static String OUTPUT_FILE_NAME = "/part-00000";
    public static String DATA_FILE_NAME = "/data.txt";
    public static String JOB_NAME = "KMeans";
    public static String SPLITTER = "\t| ";
    public static List<Double> mCenters = new ArrayList<Double>();

    /*
     * In the Mapper class we override the configure function. In it we read
     * the centroid file from the Distributed Cache and store the centers in
     * the instance variable "mCenters".
     */
    public static class Map extends MapReduceBase implements
            Mapper<LongWritable, Text, DoubleWritable, DoubleWritable> {
        @Override
        public void configure(JobConf job) {
            try {
                // Fetch the file from the Distributed Cache, read it, and
                // store the centroids in the ArrayList
                Path[] cacheFiles = DistributedCache.getLocalCacheFiles(job);
                if (cacheFiles != null && cacheFiles.length > 0) {
                    String line;
                    mCenters.clear();
                    BufferedReader cacheReader = new BufferedReader(
                            new FileReader(cacheFiles[0].toString()));
                    try {
                        // Read the file, split each line by the splitter, and
                        // store the first token in the list
                        while ((line = cacheReader.readLine()) != null) {
                            String[] temp = line.split(SPLITTER);
                            mCenters.add(Double.parseDouble(temp[0]));
                        }
                    } finally {
                        cacheReader.close();
                    }
                }
            } catch (IOException e) {
                System.err.println("Exception reading DistributedCache: " + e);
            }
        }

        /*
         * The map function finds the center nearest to the point and emits
         * the pair to the reducer.
         */
        @Override
        public void map(LongWritable key, Text value,
                OutputCollector<DoubleWritable, DoubleWritable> output,
                Reporter reporter) throws IOException {
            String line = value.toString();
            double point = Double.parseDouble(line);
            double min1, min2 = Double.MAX_VALUE, nearest_center = mCenters.get(0);
            // Find the center nearest to the point
            for (double c : mCenters) {
                min1 = c - point;
                if (Math.abs(min1) < Math.abs(min2)) {
                    nearest_center = c;
                    min2 = min1;
                }
            }
            // Emit the nearest center and the point
            output.collect(new DoubleWritable(nearest_center),
                    new DoubleWritable(point));
        }
    }

    public static class Reduce extends MapReduceBase implements
            Reducer<DoubleWritable, DoubleWritable, DoubleWritable, Text> {
        /*
         * The reduce function emits all the points assigned to a center and
         * computes the next center as the mean of those points.
         */
        @Override
        public void reduce(DoubleWritable key, Iterator<DoubleWritable> values,
                OutputCollector<DoubleWritable, Text> output, Reporter reporter)
                throws IOException {
            double newCenter;
            double sum = 0;
            int no_elements = 0;
            String points = "";
            while (values.hasNext()) {
                double d = values.next().get();
                points = points + " " + Double.toString(d);
                sum = sum + d;
                ++no_elements;
            }
            // We have the new center now
            newCenter = sum / no_elements;
            // Emit the new center and the points
            output.collect(new DoubleWritable(newCenter), new Text(points));
        }
    }

    public static void main(String[] args) throws Exception {
        run(args);
    }

    public static void run(String[] args) throws Exception {
        IN = args[0];
        OUT = args[1];
        String input = IN;
        String output = OUT + System.nanoTime();
        String again_input = output;
        // Reiterate until convergence
        int iteration = 0;
        boolean isdone = false;
        while (isdone == false) {
            JobConf conf = new JobConf(KMeans.class);
            if (iteration == 0) {
                Path hdfsPath = new Path(input + CENTROID_FILE_NAME);
                // Ship the initial centroid file via the Distributed Cache
                DistributedCache.addCacheFile(hdfsPath.toUri(), conf);
            } else {
                Path hdfsPath = new Path(again_input + OUTPUT_FILE_NAME);
                // Ship the centroids from the previous iteration instead
                DistributedCache.addCacheFile(hdfsPath.toUri(), conf);
            }
            conf.setJobName(JOB_NAME);
            conf.setMapOutputKeyClass(DoubleWritable.class);
            conf.setMapOutputValueClass(DoubleWritable.class);
            conf.setOutputKeyClass(DoubleWritable.class);
            conf.setOutputValueClass(Text.class);
            conf.setMapperClass(Map.class);
            conf.setReducerClass(Reduce.class);
            conf.setInputFormat(TextInputFormat.class);
            conf.setOutputFormat(TextOutputFormat.class);
            FileInputFormat.setInputPaths(conf,
                    new Path(input + DATA_FILE_NAME));
            FileOutputFormat.setOutputPath(conf, new Path(output));
            JobClient.runJob(conf);
            Path ofile = new Path(output + OUTPUT_FILE_NAME);
            FileSystem fs = FileSystem.get(new Configuration());
            BufferedReader br = new BufferedReader(new InputStreamReader(
                    fs.open(ofile)));
            List<Double> centers_next = new ArrayList<Double>();
            String line = br.readLine();
            while (line != null) {
                String[] sp = line.split(SPLITTER);
                double c = Double.parseDouble(sp[0]);
                centers_next.add(c);
                line = br.readLine();
            }
            br.close();
            String prev;
            if (iteration == 0) {
                prev = input + CENTROID_FILE_NAME;
            } else {
                prev = again_input + OUTPUT_FILE_NAME;
            }
            Path prevfile = new Path(prev);
            FileSystem fs1 = FileSystem.get(new Configuration());
            BufferedReader br1 = new BufferedReader(new InputStreamReader(
                    fs1.open(prevfile)));
            List<Double> centers_prev = new ArrayList<Double>();
            String l = br1.readLine();
            while (l != null) {
                String[] sp1 = l.split(SPLITTER);
                double d = Double.parseDouble(sp1[0]);
                centers_prev.add(d);
                l = br1.readLine();
            }
            br1.close();
            // Sort the old and new centroids and check the convergence
            // condition
            Collections.sort(centers_next);
            Collections.sort(centers_prev);
            Iterator<Double> it = centers_prev.iterator();
            for (double d : centers_next) {
                double temp = it.next();
                if (Math.abs(temp - d) <= 0.1) {
                    isdone = true;
                } else {
                    isdone = false;
                    break;
                }
            }
            ++iteration;
            again_input = output;
            output = OUT + System.nanoTime();
        }
    }
}
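For reference, the run() method above expects args[0] to be an HDFS directory containing both centroid.txt (one initial center per line, first token on each line) and data.txt (one numeric point per line), and args[1] to be a prefix for the per-iteration output directories. Assuming the class is packaged into KMeans.jar, a hypothetical invocation (these paths are illustrative, not from the article) would look like:
hadoop fs -mkdir /kmeans
hadoop fs -put centroid.txt /kmeans/centroid.txt
hadoop fs -put data.txt /kmeans/data.txt
hadoop jar KMeans.jar KMeans /kmeans /kmeans-out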
Hi, I'm new to Hadoop.
I don't understand how to use the Gson library for MapReduce processing.
I also don't understand where the JAR files are. Can you help me?
And I have another question: can I install Hive with this Hadoop?
Do you have a useful link for installing Hive on Windows?
Thanks a lot!
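On the Gson question: the Gson JAR (e.g. gson-2.2.4.jar) can be downloaded from Maven Central and either bundled into your job JAR or passed to the job with the generic -libjars option. Below is a minimal sketch of using it in a mapper, in the same old (mapred) API style as the KMeans code above; it assumes one JSON object per input line, and the class name and "name" field are hypothetical:
Java
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;
import com.google.gson.JsonSyntaxException;

// Sketch: emit (name, 1) for each JSON record, one object per input line
public class JsonMap extends MapReduceBase implements
        Mapper<LongWritable, Text, Text, IntWritable> {
    private static final IntWritable ONE = new IntWritable(1);

    public void map(LongWritable key, Text value,
            OutputCollector<Text, IntWritable> output, Reporter reporter)
            throws IOException {
        try {
            // Parse the whole line as a single JSON object
            JsonObject record = new JsonParser().parse(value.toString())
                    .getAsJsonObject();
            if (record.has("name")) {
                output.collect(new Text(record.get("name").getAsString()), ONE);
            }
        } catch (JsonSyntaxException e) {
            // Skip malformed lines instead of failing the whole task
        }
    }
}
Parsing inside a try/catch keeps one malformed record from killing the entire task attempt.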
I am following your tutorial, but I can't find Java SDK 1.6. The link in the article doesn't work, and when I search for it on the Oracle website I can't find it. Can you please help?
There is now a pre-compiled Hadoop for Windows. It doesn't require the "server" edition of Windows (unlike HDP). It uses a bash shell so that it integrates seamlessly into a typical cluster of nodes, allowing 'sh' scripts and such to run (without special consideration for being "a Windows machine"). It's available at: zetabyte.tk/hadoop.php
Thanks for this. I was able to set up and run a MapReduce program from a JAR without any difficulties.
However, could you please guide us on how to configure Eclipse to debug and run a MapReduce program on this cluster?
I was not able to open the hdfs:// links in my browser and couldn't tell whether the namenode and datanodes were running (in pseudo-distributed mode).
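Two general notes that may help (not specific to this article): hdfs:// URIs are filesystem addresses, not web links, so a browser cannot open them; in Hadoop 2.x the NameNode web UI is normally at http://localhost:50070 and the YARN ResourceManager at http://localhost:8088, which is where you can see whether the daemons are up. For debugging in Eclipse, one common trick is to run the job with the local in-process runner so breakpoints in map() and reduce() are hit; a sketch, with assumed property values:
Java
// In the run() method, construct the JobConf like this for debugging:
JobConf conf = new JobConf(KMeans.class);
conf.set("fs.defaultFS", "file:///");           // read/write the local filesystem
conf.set("mapreduce.framework.name", "local");  // run map/reduce inside this JVM
// ...then set the mapper, reducer, and paths as usual and call JobClient.runJob(conf)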
Hello sir,
I am doing a project on Hadoop, and for that I have to use the YARN Scheduler Load Simulator.
I successfully installed Hadoop 2.7.1 on Windows 7 64-bit. The namenode, datanode, etc. are running, but I don't know how to start SLS. Please help me; it's urgent.
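In Hadoop 2.7.x the YARN Scheduler Load Simulator ships under share\hadoop\tools\sls, and the SLS documentation starts it with the slsrun.sh script, roughly like this (the trace file name below is just a placeholder):
bash share/hadoop/tools/sls/bin/slsrun.sh --input-rumen=job-trace.json --output-dir=sls-run-1
Because it is a shell script, on Windows you would need a bash environment such as Cygwin or Git Bash to run it.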
I followed this article for setting up Hadoop, but I got the following error:
C:\hadoop-2.3.0\bin>hadoop
The system cannot find the path specified.
Error: JAVA_HOME is incorrectly set.
Please update C:\hadoop-2.3.0\conf\hadoop-env.cmd
Usage: hadoop [--config confdir] COMMAND
where COMMAND is one of:
fs run a generic filesystem user client
version print the version
jar <jar> run a jar file
checknative [-a|-h] check native hadoop and compression libraries availability
distcp <srcurl> <desturl> copy file or directories recursively
archive -archiveName NAME -p <parent path> <src>* <dest> create a hadoop archive
classpath prints the class path needed to get the
Hadoop jar and the required libraries
daemonlog get/set the log level for each daemon
or
CLASSNAME run the class named CLASSNAME
Most commands print help when invoked w/o parameters.
I have created the JAVA_HOME and HADOOP_HOME environment variables and updated the PATH variable as well. I have set JAVA_HOME to "C:\Program Files\Java\jdk1.6.0_31" in hadoop-env.cmd.
c:\hadoop-2.3.0\bin>hadoop namenode -format
DEPRECATED: Use of this script to execute hdfs command is deprecated.
Instead use the hdfs command for it.
Exception in thread "main" java.lang.NoClassDefFoundError: V
Caused by: java.lang.ClassNotFoundException: V
at java.net.URLClassLoader$1.run(URLClassLoader.java:202)
at java.security.AccessController.doPrivileged(Native Method)
at java.net.URLClassLoader.findClass(URLClassLoader.java:190)
at java.lang.ClassLoader.loadClass(ClassLoader.java:306)
at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:301)
at java.lang.ClassLoader.loadClass(ClassLoader.java:247)
Could not find the main class: V. Program will exit.
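A likely culprit here is the space in "C:\Program Files": the Windows Hadoop scripts are known to mishandle a JAVA_HOME containing spaces, which can produce both the "JAVA_HOME is incorrectly set" message and errors where a stray path fragment is interpreted as a class name. A commonly suggested workaround is to use the 8.3 short path in hadoop-env.cmd (in Hadoop 2.x the file lives under etc\hadoop), for example:
set JAVA_HOME=C:\PROGRA~1\Java\jdk1.6.0_31
or to install the JDK to a path without spaces, such as C:\Java\jdk1.6.0_31.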
I am getting the following error while doing 'hadoop namenode -format':
Exception in thread "main" java.lang.NoClassDefFoundError: Krishna
Caused by: java.lang.ClassNotFoundException: Krishna
at java.net.URLClassLoader$1.run(URLClassLoader.java:202)
at java.security.AccessController.doPrivileged(Native Method)
at java.net.URLClassLoader.findClass(URLClassLoader.java:190)
at java.lang.ClassLoader.loadClass(ClassLoader.java:306)
at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:301)
at java.lang.ClassLoader.loadClass(ClassLoader.java:247)
Could not find the main class: Krishna. Program will exit.
2. I tried to download the recipeitems file from the suggested link http://openrecipes.s3.amazonaws.com/recipeitems-latest.json.gz, but I cannot open the file. Can you please let me know where I can find the recipeitems-latest.json file?
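Note that the link serves a gzip-compressed file (hence the .gz extension); it has to be decompressed first (7-Zip handles .gz on Windows) and the resulting recipeitems-latest.json uploaded to HDFS, for example:
hadoop fs -put recipeitems-latest.json /in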
c:\hadoop-2.3.0\sbin>hadoop jar c:\Hwork\Recipe.jar Recipe /in /out
14/04/12 00:52:02 INFO client.RMProxy: Connecting to ResourceManager at /0.0.0.0:8032
14/04/12 00:52:03 INFO input.FileInputFormat: Total input paths to process : 1
14/04/12 00:52:03 INFO mapreduce.JobSubmitter: number of splits:1
14/04/12 00:52:04 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1397243723769_0001
14/04/12 00:52:04 INFO impl.YarnClientImpl: Submitted application application_1397243723769_0001
14/04/12 00:52:04 INFO mapreduce.Job: The url to track the job: http://OmSkathi:8088/proxy/application_1397243723769_0001/
14/04/12 00:52:04 INFO mapreduce.Job: Running job: job_1397243723769_0001
After the above line the system does not go any further; it gets stuck. Please help me.
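When a job hangs at "Running job" like this, one common cause (general advice, not specific to this article) is that no NodeManager has registered with the ResourceManager, so there are no containers available to run the tasks. It is worth opening the tracking URL shown in the log (http://OmSkathi:8088) to check whether any active nodes are listed, and making sure both YARN daemons were started, e.g.:
C:\hadoop-2.3.0\sbin>start-yarn.cmd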
Thanks, Praba, very much for collecting all the relevant info and links for setting up HDFS and YARN, and for the very practical example of MapReduce. This is one of the best and most useful articles for setting up Hadoop on our laptops/PCs. Using it, I was able to set up and demo my own project to my business partner on my laptop in less than an hour. I appreciate all your efforts to lay it out step by step in such great detail.
Hi,
I'm getting the errors below while installing Hadoop on my Windows 7 64-bit machine. I don't know what the problem is.
In the datanode:
DEPRECATED: Use of this script to execute hdfs command is deprecated.
Instead use the hdfs command for it.
15/07/01 00:37:36 INFO datanode.DataNode: STARTUP_MSG:
STARTUP_MSG: java = 1.8.0_45
************************************************************/
15/07/01 00:37:36 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
15/07/01 00:37:37 FATAL datanode.DataNode: Exception in secureMain
java.lang.UnsatisfiedLinkError: org.apache.hadoop.io.nativeio.NativeIO$Windows.access0(Ljava/lang/String;I)Z
    at org.apache.hadoop.io.nativeio.NativeIO$Windows.access0(Native Method)
    at org.apache.hadoop.io.nativeio.NativeIO$Windows.access(NativeIO.java:560)
    at org.apache.hadoop.fs.FileUtil.canRead(FileUtil.java:977)
    at org.apache.hadoop.util.DiskChecker.checkAccessByFileMethods(DiskChecker.java:177)
    at org.apache.hadoop.util.DiskChecker.checkDirAccess(DiskChecker.java:164)
    at org.apache.hadoop.util.DiskChecker.checkDir(DiskChecker.java:147)
    at org.apache.hadoop.hdfs.server.datanode.DataNode$DataNodeDiskChecker.checkDir(DataNode.java:1819)
    at org.apache.hadoop.hdfs.server.datanode.DataNode.checkStorageLocations(DataNode.java:1861)
    at org.apache.hadoop.hdfs.server.datanode.DataNode.makeInstance(DataNode.java:1843)
    at org.apache.hadoop.hdfs.server.datanode.DataNode.instantiateDataNode(DataNode.java:1748)
    at org.apache.hadoop.hdfs.server.datanode.DataNode.createDataNode(DataNode.java:1786)
    at org.apache.hadoop.hdfs.server.datanode.DataNode.secureMain(DataNode.java:1952)
    at org.apache.hadoop.hdfs.server.datanode.DataNode.main(DataNode.java:1973)
15/07/01 00:37:37 INFO util.ExitUtil: Exiting with status 1
15/07/01 00:37:37 INFO datanode.DataNode: SHUTDOWN_MSG:
/************************************************************
SHUTDOWN_MSG: Shutting down DataNode at EKA-PC/192.168.0.21
************************************************************/
C:\hadoop-2.3.0\sbin
In the namenode:
STARTUP_MSG: java = 1.8.0_45
************************************************************/
15/07/01 00:37:36 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
15/07/01 00:37:37 FATAL datanode.DataNode: Exception in secureMain
java.lang.UnsatisfiedLinkError: org.apache.hadoop.io.nativeio.NativeIO$Windows.access0(Ljava/lang/String;I)Z
    at org.apache.hadoop.io.nativeio.NativeIO$Windows.access0(Native Method)
    at org.apache.hadoop.io.nativeio.NativeIO$Windows.access(NativeIO.java:560)
    at org.apache.hadoop.fs.FileUtil.canRead(FileUtil.java:977)
    at org.apache.hadoop.util.DiskChecker.checkAccessByFileMethods(DiskChecker.java:177)
    at org.apache.hadoop.util.DiskChecker.checkDirAccess(DiskChecker.java:164)
    at org.apache.hadoop.util.DiskChecker.checkDir(DiskChecker.java:147)
    at org.apache.hadoop.hdfs.server.datanode.DataNode$DataNodeDiskChecker.checkDir(DataNode.java:1819)
    at org.apache.hadoop.hdfs.server.datanode.DataNode.checkStorageLocations(DataNode.java:1861)
    at org.apache.hadoop.hdfs.server.datanode.DataNode.makeInstance(DataNode.java:1843)
    at org.apache.hadoop.hdfs.server.datanode.DataNode.instantiateDataNode(DataNode.java:1748)
    at org.apache.hadoop.hdfs.server.datanode.DataNode.createDataNode(DataNode.java:1786)
    at org.apache.hadoop.hdfs.server.datanode.DataNode.secureMain(DataNode.java:1952)
    at org.apache.hadoop.hdfs.server.datanode.DataNode.main(DataNode.java:1973)
15/07/01 00:37:37 INFO util.ExitUtil: Exiting with status 1
15/07/01 00:37:37 INFO datanode.DataNode: SHUTDOWN_MSG:
/************************************************************
SHUTDOWN_MSG: Shutting down DataNode at EKA-PC/192.168.0.21
************************************************************/
C:\hadoop-2.3.0\sbin>
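The UnsatisfiedLinkError on NativeIO$Windows.access0 usually means the Windows native binaries (winutils.exe and hadoop.dll) are missing from %HADOOP_HOME%\bin, were built for a different Hadoop version or bitness, or hadoop.dll is not visible on the PATH. A quick check (assuming HADOOP_HOME is set):
C:\>dir %HADOOP_HOME%\bin\winutils.exe %HADOOP_HOME%\bin\hadoop.dll
Both files must match the Hadoop version being run (2.3.0 here) and the bitness of the JVM.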
Exception in thread "main" java.lang.NoClassDefFoundError: com/sun/tools/javac/Main
Caused by: java.lang.ClassNotFoundException: com.sun.tools.javac.Main
at java.net.URLClassLoader$1.run(Unknown Source)
at java.security.AccessController.doPrivileged(Native Method)
at java.net.URLClassLoader.findClass(Unknown Source)
at java.lang.ClassLoader.loadClass(Unknown Source)
at sun.misc.Launcher$AppClassLoader.loadClass(Unknown Source)
at java.lang.ClassLoader.loadClass(Unknown Source)
Could not find the main class: com.sun.tools.javac.Main. Program will exit.
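com.sun.tools.javac.Main lives in the JDK's lib\tools.jar, so this error usually means JAVA_HOME points at a JRE rather than a full JDK (or tools.jar is otherwise missing from the classpath used for compiling). A quick check (assuming JAVA_HOME is set):
C:\>dir "%JAVA_HOME%\lib\tools.jar"
If the file is not there, point JAVA_HOME at a JDK installation instead.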