Apache Hadoop
The Apache Hadoop project develops open-source software for reliable, scalable, distributed computing.
The Apache Hadoop software library is a framework that allows for the distributed processing of large data sets across clusters of computers using simple programming models. It is designed to scale up from single servers to thousands of machines, each offering local computation and storage.
Hadoop modules:
- Hadoop Common: the common utilities that support the other Hadoop modules.
- Hadoop Distributed File System (HDFS): a distributed file system that provides high-throughput access to application data.
- Hadoop YARN: a framework for job scheduling and cluster resource management.
- Hadoop MapReduce: a YARN-based system for parallel processing of large data sets.
Install
cd ~/tmp
wget https://archive.apache.org/dist/hadoop/core/hadoop-3.3.1/hadoop-3.3.1.tar.gz
tar tvzf hadoop-3.3.1.tar.gz
tar xvzf hadoop-3.3.1.tar.gz
~/tmp/hadoop-3.3.1/etc/hadoop/hadoop-env.sh
export JAVA_HOME=/home/vitor/jdk-11.0.10+9
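A bad JAVA_HOME is the most common reason the daemons fail to start, so it can pay to validate the path before touching anything else. A minimal sketch (the default path below is just this page's example; `check_java_home` is a hypothetical helper, not part of Hadoop):

```shell
# Sketch: sanity-check that a candidate JAVA_HOME actually contains a java
# binary before writing it into hadoop-env.sh.
check_java_home() {
  # succeeds when $1/bin/java exists and is executable
  [ -x "$1/bin/java" ]
}

if check_java_home "${JAVA_HOME:-/home/vitor/jdk-11.0.10+9}"; then
  echo "JAVA_HOME looks valid"
else
  echo "JAVA_HOME invalid; fix the export in hadoop-env.sh" >&2
fi
```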
~/tmp/hadoop-3.3.1/etc/hadoop/core-site.xml
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
  <property>
    <name>fs.default.name</name>
    <value>hdfs://master:9000</value>
  </property>
</configuration>
~/tmp/hadoop-3.3.1/etc/hadoop/hdfs-site.xml
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
  <property>
    <name>dfs.namenode.name.dir</name>
    <value>/tmp/nameNode</value>
  </property>
  <property>
    <name>dfs.datanode.data.dir</name>
    <value>/tmp/dataNode</value>
  </property>
  <property>
    <name>dfs.replication</name>
    <value>2</value>
  </property>
</configuration>
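Since several *-site.xml files are edited by hand here, echoing the configured properties back out makes a typo in a dfs.* key visible before any daemon starts. A sketch, assuming GNU grep/sed and the flat name/value layout used in these files (`site_properties` is a hypothetical helper, not a Hadoop tool):

```shell
# Sketch: print "name=value" pairs from a Hadoop *-site.xml. Only handles the
# simple flat <name>/<value> texts used on this page (no nested markup).
site_properties() {
  paste -d'=' \
    <(grep -o '<name>[^<]*</name>' "$1" | sed 's/<[^>]*>//g') \
    <(grep -o '<value>[^<]*</value>' "$1" | sed 's/<[^>]*>//g')
}
# usage: site_properties ~/tmp/hadoop-3.3.1/etc/hadoop/hdfs-site.xml
```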
~/tmp/hadoop-3.3.1/etc/hadoop/mapred-site.xml
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
  <property>
    <name>mapreduce.framework.name</name>
    <value>yarn</value>
  </property>
</configuration>
~/tmp/hadoop-3.3.1/etc/hadoop/yarn-site.xml
<?xml version="1.0"?>
<configuration>
  <property>
    <name>yarn.acl.enable</name>
    <value>0</value>
  </property>
  <property>
    <name>yarn.resourcemanager.hostname</name>
    <value>master</value>
  </property>
  <property>
    <name>yarn.nodemanager.aux-services</name>
    <value>mapreduce_shuffle</value>
  </property>
</configuration>
~/tmp/hadoop-3.3.1/etc/hadoop/workers
Hadoop 3.x reads the worker list from the workers file (the file was named slaves in Hadoop 2.x).
localhost
~/.bashrc
export HADOOP_HOME=/home/vitor/tmp/hadoop-3.3.1
export PATH=$PATH:$HADOOP_HOME/bin:$HADOOP_HOME/sbin
Setup HDFS
ls $HADOOP_HOME/bin/
hdfs dfsadmin -report
# 2021-07-23 14:07:17,010 WARN fs.FileSystem: Failed to initialize fileystem hdfs://master:9000:
# java.lang.IllegalArgumentException: java.net.UnknownHostException: master
# report: java.net.UnknownHostException: master
# add 127.0.0.1 master to /etc/hosts
hdfs namenode -format master
hdfs --daemon start namenode
hdfs --daemon start datanode
yarn --daemon start resourcemanager
yarn --daemon start nodemanager
yarn --daemon start proxyserver
mapred --daemon start historyserver
hdfs dfsadmin -report
# http://localhost:9870/
# http://localhost:9870/dfshealth.html#tab-overview
# http://localhost:9870/explorer.html#
# http://localhost:8088/
# http://localhost:8088/cluster
# http://localhost:19888/
hadoop fs -ls /
hadoop fs -ls /tmp
hadoop fs -mkdir /test
hadoop fs -ls /
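The UnknownHostException in the listing above comes from the missing hosts entry, and the check can be done up front instead of after a failed report. A sketch assuming GNU grep; `has_host_mapping` is a hypothetical helper, and the HOSTS_FILE override exists only so the check can be pointed at a test file:

```shell
# Sketch: confirm the hostname "master" (used in core-site.xml and
# yarn-site.xml) resolves via the hosts file before formatting the NameNode.
has_host_mapping() {
  # true when some "address  names..." line in file $2 contains hostname $1
  grep -qE "^[0-9a-fA-F.:]+[[:space:]].*\b$1\b" "$2"
}

if has_host_mapping master "${HOSTS_FILE:-/etc/hosts}"; then
  echo "master resolves via hosts file"
else
  echo "add '127.0.0.1 master' to /etc/hosts before 'hdfs namenode -format'" >&2
fi
```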