dummy

RAWCODE

1 XML

1.1 core-site

current: r3.1.1

core-default.xml

1.2 hdfs-site

current: r3.1.1

hdfs-default.xml

<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>

<configuration>
    <!-- Logical name(s) of the HDFS nameservice; separate multiple names with commas -->
    <property>
        <name>dfs.nameservices</name>
        <value>bigha</value>
    </property>

    <!-- The "bigha" nameservice has two NameNodes: nn1 and nn2 -->
    <property>
        <name>dfs.ha.namenodes.bigha</name>
        <value>nn1,nn2</value>
    </property>

    <!-- RPC address of nn1 -->
    <property>
        <name>dfs.namenode.rpc-address.bigha.nn1</name>
        <value>node0:8020</value>
    </property>

    <!-- HTTP address of nn1.
         NOTE(review): 50070 is the Hadoop 2.x default web port; the 3.x default
         is 9870 — confirm this override is intentional. -->
    <property>
        <name>dfs.namenode.http-address.bigha.nn1</name>
        <value>node0:50070</value>
    </property>

    <!-- RPC address of nn2 -->
    <property>
        <name>dfs.namenode.rpc-address.bigha.nn2</name>
        <value>node1:8020</value>
    </property>

    <!-- HTTP address of nn2 (same port note as nn1) -->
    <property>
        <name>dfs.namenode.http-address.bigha.nn2</name>
        <value>node1:50070</value>
    </property>

    <!-- JournalNodes that store the NameNode shared edit log.
         Use an odd number of nodes, at least three. -->
    <property>
        <name>dfs.namenode.shared.edits.dir</name>
        <value>qjournal://node2:8485;node3:8485;node4:8485/bigha</value>
    </property>

    <!-- Local absolute path on each JournalNode host where the JournalNode
         daemon keeps its state -->
    <property>
        <name>dfs.journalnode.edits.dir</name>
        <value>/opt/ws/hadoop/journal/</value>
    </property>

    <!-- Enable automatic failover when the active NameNode fails -->
    <property>
        <name>dfs.ha.automatic-failover.enabled</name>
        <value>true</value>
    </property>

    <!-- Client-side proxy provider used to locate the active NameNode -->
    <property>
        <name>dfs.client.failover.proxy.provider.bigha</name>
        <value>org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider</value>
    </property>

    <!-- Fencing methods; multiple methods are separated by newlines.
         shell(/bin/true) is the fallback when sshfence cannot reach the host. -->
    <property>
        <name>dfs.ha.fencing.methods</name>
        <value>
            sshfence
            shell(/bin/true)
        </value>
    </property>

    <!-- sshfence needs passwordless SSH: private key used to log in -->
    <property>
        <name>dfs.ha.fencing.ssh.private-key-files</name>
        <value>/home/lidong/.ssh/id_rsa</value>
    </property>

    <!-- SSH connect timeout for sshfence: 30 seconds -->
    <property>
        <name>dfs.ha.fencing.ssh.connect-timeout</name>
        <value>30000</value>
    </property>

    <!-- Disk space (bytes) reserved per volume so DataNode disks are never
         completely filled; 2147483648 = 2 GiB -->
    <property>
        <name>dfs.datanode.du.reserved</name>
        <value>2147483648</value>
    </property>

    <!-- Local directory where the NameNode stores its namespace (fsimage) -->
    <property>
        <name>dfs.namenode.name.dir</name>
        <value>file:///opt/ws/hadoop/hdfs/name</value>
    </property>

    <!-- Local directory where the DataNode stores block data -->
    <property>
        <name>dfs.datanode.data.dir</name>
        <value>file:///opt/ws/hadoop/hdfs/data</value>
    </property>

    <!-- Block replication factor -->
    <property>
        <name>dfs.replication</name>
        <value>3</value>
    </property>

    <!-- Enable the WebHDFS REST API -->
    <property>
        <name>dfs.webhdfs.enabled</name>
        <value>true</value>
    </property>

    <!-- NameNode server thread count -->
    <property>
        <name>dfs.namenode.handler.count</name>
        <value>200</value>
        <description>The number of server threads for the namenode.</description>
    </property>

    <!-- DataNode server thread count -->
    <property>
        <name>dfs.datanode.handler.count</name>
        <value>200</value>
        <description>The number of server threads for the datanode.</description>
    </property>

    <!-- Maximum number of threads used for data transfer -->
    <property>
        <name>dfs.datanode.max.transfer.threads</name>
        <value>1024</value>
    </property>

    <!-- HDFS block size in bytes; 5242880 = 5 MiB.
         NOTE(review): far below the 128 MiB default — confirm this is a
         deliberate small-file/test setting. -->
    <property>
        <name>dfs.blocksize</name>
        <value>5242880</value>
    </property>

    <!-- JournalNode write-transaction timeout (ms) -->
    <property>
        <name>dfs.qjournal.write-txns.timeout.ms</name>
        <value>300000</value>
    </property>

<!--     <property>
   -         <name>dfs.namenode.fs-limits.min-block-size</name>
   -         <value>1048576</value>
   -     </property>
   -
   -     <property>
   -         <name>dfs.namenode.fs-limits.max-blocks-per-file</name>
   -         <value>1048576</value>
   -     </property> -->

</configuration>

1.3 mapred-site

current: r3.1.1

mapred-default.xml

1.4 yarn-site

current: r3.1.1

yarn-default.xml

<?xml version="1.0" encoding="UTF-8"?>

<configuration>

    <!-- Enable log aggregation -->
    <property>
        <name>yarn.log-aggregation-enable</name>
        <value>true</value>
    </property>

    <!-- Retention time (seconds) for aggregated logs on the DFS.
         NOTE(review): 4320 s is only 72 minutes — 259200 (3 days) is a more
         common setting; confirm this value is intentional. -->
    <property>
        <name>yarn.log-aggregation.retain-seconds</name>
        <value>4320</value>
    </property>

    <!-- Aggregate log (bigha fs) -->
    <property>
        <name>yarn.nodemanager.remote-app-log-dir</name>
        <value>/tmp/logs</value>
    </property>

    <!-- Retry interval (ms) when reconnecting to the ResourceManager -->
    <property>
        <name>yarn.resourcemanager.connect.retry-interval.ms</name>
        <value>2000</value>
    </property>

    <!-- ZooKeeper quorum used by the RM.
         NOTE(review): yarn.resourcemanager.zk-address is deprecated in
         Hadoop 3.x in favor of hadoop.zk.address — confirm against the
         running version. -->
    <property>
        <name>yarn.resourcemanager.zk-address</name>
        <value>node2:2181,node3:2181,node4:2181</value>
    </property>

    <!-- Cluster ID; prevents this RM from becoming active for a different
         cluster sharing the same ZooKeeper ensemble -->
    <property>
        <name>yarn.resourcemanager.cluster-id</name>
        <value>bigcluster</value>
    </property>

    <!-- Enable ResourceManager HA -->
    <property>
        <name>yarn.resourcemanager.ha.enabled</name>
        <value>true</value>
    </property>

    <!-- Logical IDs of the ResourceManagers -->
    <property>
        <name>yarn.resourcemanager.ha.rm-ids</name>
        <value>rm1,rm2</value>
    </property>

    <!-- Hostname for rm1 -->
    <property>
        <name>yarn.resourcemanager.hostname.rm1</name>
        <value>node0</value>
    </property>

    <!-- Hostname for rm2 -->
    <property>
        <name>yarn.resourcemanager.hostname.rm2</name>
        <value>node5</value>
    </property>

    <property>
        <name>yarn.resourcemanager.address.rm1</name>
        <value>node0:8032</value>
    </property>

    <property>
        <name>yarn.resourcemanager.address.rm2</name>
        <value>node5:8032</value>
    </property>

    <property>
        <name>yarn.resourcemanager.scheduler.address.rm1</name>
        <value>node0:8030</value>
    </property>

    <property>
        <name>yarn.resourcemanager.scheduler.address.rm2</name>
        <value>node5:8030</value>
    </property>

    <property>
        <name>yarn.resourcemanager.resource-tracker.address.rm1</name>
        <value>node0:8031</value>
    </property>

    <property>
        <name>yarn.resourcemanager.resource-tracker.address.rm2</name>
        <value>node5:8031</value>
    </property>

    <property>
        <name>yarn.resourcemanager.webapp.address.rm1</name>
        <value>node0:8088</value>
    </property>

    <property>
        <name>yarn.resourcemanager.webapp.address.rm2</name>
        <value>node5:8088</value>
    </property>

    <!-- Enable automatic failover between RMs -->
    <property>
        <name>yarn.resourcemanager.ha.automatic-failover.enabled</name>
        <value>true</value>
    </property>

    <property>
        <name>yarn.resourcemanager.ha.automatic-failover.embedded</name>
        <value>true</value>
    </property>

    <property>
        <name>yarn.resourcemanager.ha.automatic-failover.zk-base-path</name>
        <value>/yarn-leader-election</value>
    </property>

    <!-- Enable RM state recovery after restart/failover -->
    <property>
        <name>yarn.resourcemanager.recovery.enabled</name>
        <value>true</value>
    </property>

    <!-- Tasks launched by the AM do not inherit the parent process classpath;
         supply it here (or via -libjars when submitting a jar) -->
    <property>
        <name>yarn.application.classpath</name>
        <value>
            /opt/hadoop/,
            /opt/hadoop/etc/hadoop/*,
            /opt/hadoop/share/hadoop/common/*,/opt/hadoop/share/hadoop/common/lib/*,
            /opt/hadoop/share/hadoop/hdfs/*,/opt/hadoop/share/hadoop/hdfs/lib/*,
            /opt/hadoop/share/hadoop/mapreduce/*,/opt/hadoop/share/hadoop/mapreduce/lib/*,
            /opt/hadoop/share/hadoop/yarn/*,/opt/hadoop/share/hadoop/yarn/lib/*,
            /opt/hadoop/share/hadoop/tools/lib/*,
            /opt/hbase/conf/,/opt/hbase/lib/*
        </value>
    </property>

    <property>
        <name>yarn.resourcemanager.store.class</name>
        <value>org.apache.hadoop.yarn.server.resourcemanager.recovery.ZKRMStateStore</value>
    </property>

    <!-- ZooKeeper parent path under which RM state is stored -->
    <property>
        <name>yarn.resourcemanager.zk-state-store.parent-path</name>
        <value>/rmstore</value>
    </property>

    <!-- Reducers fetch map output through the mapreduce_shuffle aux service -->
    <property>
        <name>yarn.nodemanager.aux-services</name>
        <value>mapreduce_shuffle</value>
    </property>

    <!-- <property>
       -     <name>yarn.nodemanager.aux-services.mapreduce_shuffle.class</name>
       -     <value>org.apache.hadoop.mapred.ShuffleHandler</value>
       - </property>     -->

    <!-- Total physical memory (MB) available to containers on this node;
         the stock default is 8192 -->
    <property>
        <name>yarn.nodemanager.resource.memory-mb</name>
        <value>1024</value>
    </property>

    <!-- Total CPU vcores available to containers; default is 8 -->
    <property>
        <name>yarn.nodemanager.resource.cpu-vcores</name>
        <value>1</value>
    </property>

    <!-- Minimum memory allocation per container (MB); default is 1024 -->
    <property>
        <name>yarn.scheduler.minimum-allocation-mb</name>
        <value>256</value>
    </property>

    <!-- Minimum vcore allocation per container; default is 1 -->
    <property>
        <name>yarn.scheduler.minimum-allocation-vcores</name>
        <value>1</value>
    </property>

    <!-- Maximum memory allocation per container (MB); the stock default is 8192 -->
    <property>
        <name>yarn.scheduler.maximum-allocation-mb</name>
        <value>1024</value>
    </property>

    <!-- Maximum vcore allocation per container; default is 4 -->
    <property>
        <name>yarn.scheduler.maximum-allocation-vcores</name>
        <value>1</value>
    </property>

    <!-- Enforce physical memory limits: containers exceeding
         mapreduce.map|reduce.memory.mb are killed -->
    <property>
        <name>yarn.nodemanager.pmem-check-enabled</name>
        <value>true</value>
    </property>

    <!-- Enforce virtual memory limits: containers exceeding
         yarn.nodemanager.vmem-pmem-ratio times mapreduce.map|reduce.memory.mb
         are killed -->
    <property>
        <name>yarn.nodemanager.vmem-check-enabled</name>
        <value>true</value>
    </property>

    <!-- Virtual-to-physical memory ratio; default is 2.1 -->
    <property>
        <name>yarn.nodemanager.vmem-pmem-ratio</name>
        <value>6.0</value>
    </property>

    <!-- URL of the log server (JobHistory server) for aggregated YARN logs -->
    <property>
        <name>yarn.log.server.url</name>
        <value>http://node0:19888/jobhistory/logs</value>
    </property>

</configuration>

1.5 hbase-site

<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
    <!-- Root directory for HBase data on HDFS. For a non-HA cluster this must
         match fs.defaultFS in core-site.xml exactly; here it points at the
         "bigha" HA nameservice. -->
    <property>
        <name>hbase.rootdir</name>
        <value>hdfs://bigha/hbase</value>
    </property>

    <property>
        <name>zookeeper.znode.parent</name>
        <value>/hbase</value>
    </property>

    <!-- Run in fully distributed mode -->
    <property>
        <name>hbase.cluster.distributed</name>
        <value>true</value>
    </property>

    <!-- ZooKeeper quorum hosts (port defaults to 2181) -->
    <property>
        <name>hbase.zookeeper.quorum</name>
        <value>node2,node3,node4</value>
    </property>

    <!-- Local temp directory for HBase scratch data, e.g. table pre-split info -->
    <property>
        <name>hbase.tmp.dir</name>
        <value>/opt/ws/hbase/tmp</value>
    </property>

    <!-- ZooKeeper data directory (must match dataDir in zoo.cfg) -->
    <property>
        <name>hbase.zookeeper.property.dataDir</name>
        <value>/opt/ws/zookeeper/data</value>
    </property>

    <!-- ZooKeeper client connection port -->
    <property>
        <name>hbase.zookeeper.property.clientPort</name>
        <value>2181</value>
    </property>

    <!-- RPC handler thread count.
         NOTE(review): original comment said "Master" but this property sets
         the RegionServer handler count — confirm intent. -->
    <property>
        <name>hbase.regionserver.handler.count</name>
        <value>10</value>
    </property>

    <!-- Session timeout between a RegionServer and ZooKeeper. When it expires
         the RegionServer is removed from the cluster; the HMaster then
         rebalances its regions onto the surviving RegionServers. -->
    <property>
        <name>zookeeper.session.timeout</name>
        <value>30000</value>
    </property>

    <!-- How long an edit is cached in memory before a periodic flush;
         default is 3600000 ms -->
    <property>
        <name>hbase.regionserver.optionalcacheflushinterval</name>
        <value>7200000</value>
    </property>

    <!-- Fraction of max heap (-Xmx) given to the HFile/StoreFile block cache.
         Default 0.4 (40%); 0 disables it (not recommended). -->
    <property>
        <name>hfile.block.cache.size</name>
        <value>0.3</value>
    </property>

    <!-- Maximum HStoreFile size; the region splits in two when exceeded
         (134217728 = 128 MiB) -->
    <property>
        <name>hbase.hregion.max.filesize</name>
        <value>134217728</value>
    </property>

    <!-- Memstore size at which its contents are flushed to disk -->
    <property>
        <name>hbase.hregion.memstore.flush.size</name>
        <value>134217728</value>
    </property>

    <!-- HDFS client socket timeout (ms); increase for slow clusters.
         Fixed: the value previously contained a trailing space, which can
         break numeric parsing for consumers that read it untrimmed. -->
    <property>
        <name>dfs.client.socket-timeout</name>
        <value>60000</value>
    </property>

    <!-- Default ports:
      -     <property >
      -         <name>hbase.master.port</name>
      -         <value>60000</value>
      -     </property>
      -
      -     <property>
      -         <name>hbase.master.info.port</name>
      -         <value>60010</value>
      -     </property>
      -
      -     <property>
      -         <name>hbase.regionserver.port</name>
      -         <value>60020</value>
      -     </property>
      -
      -     <property>
      -         <name>hbase.regionserver.info.port</name>
      -         <value>60030</value>
      -     </property>
      -  -->
</configuration>

2 Default Ports

PORT CONFIG NAME CONFIG VALUE
0 dfs.balancer.address 0.0.0.0:0
9866 dfs.datanode.address 0.0.0.0:9866
9864 dfs.datanode.http.address 0.0.0.0:9864
9865 dfs.datanode.https.address 0.0.0.0:9865
9867 dfs.datanode.ipc.address 0.0.0.0:9867
8111 dfs.federation.router.admin-address 0.0.0.0:8111
50071 dfs.federation.router.http-address 0.0.0.0:50071
50072 dfs.federation.router.https-address 0.0.0.0:50072
8888 dfs.federation.router.rpc-address 0.0.0.0:8888
8480 dfs.journalnode.http-address 0.0.0.0:8480
8481 dfs.journalnode.https-address 0.0.0.0:8481
8485 dfs.journalnode.rpc-address 0.0.0.0:8485
0 dfs.mover.address 0.0.0.0:0
50100 dfs.namenode.backup.address 0.0.0.0:50100
50105 dfs.namenode.backup.http-address 0.0.0.0:50105
9870 dfs.namenode.http-address 0.0.0.0:9870
9871 dfs.namenode.https-address 0.0.0.0:9871
9868 dfs.namenode.secondary.http-address 0.0.0.0:9868
9869 dfs.namenode.secondary.https-address 0.0.0.0:9869
50200 dfs.provided.aliasmap.inmemory.dnrpc-address 0.0.0.0:50200
2181 hadoop.registry.zk.quorum localhost:2181
10020 mapreduce.jobhistory.address 0.0.0.0:10020
10033 mapreduce.jobhistory.admin.address 0.0.0.0:10033
19888 mapreduce.jobhistory.webapp.address 0.0.0.0:19888
19890 mapreduce.jobhistory.webapp.https.address 0.0.0.0:19890
0 yarn.nodemanager.address ${yarn.nodemanager.hostname}:0
8049 yarn.nodemanager.amrmproxy.address 0.0.0.0:8049
8048 yarn.nodemanager.collector-service.address ${yarn.nodemanager.hostname}:8048
8040 yarn.nodemanager.localizer.address ${yarn.nodemanager.hostname}:8040
8042 yarn.nodemanager.webapp.address ${yarn.nodemanager.hostname}:8042
8044 yarn.nodemanager.webapp.https.address 0.0.0.0:8044
8032 yarn.resourcemanager.address ${yarn.resourcemanager.hostname}:8032
8033 yarn.resourcemanager.admin.address ${yarn.resourcemanager.hostname}:8033
8031 yarn.resourcemanager.resource-tracker.address ${yarn.resourcemanager.hostname}:8031
8030 yarn.resourcemanager.scheduler.address ${yarn.resourcemanager.hostname}:8030
8088 yarn.resourcemanager.webapp.address ${yarn.resourcemanager.hostname}:8088
8090 yarn.resourcemanager.webapp.https.address ${yarn.resourcemanager.hostname}:8090
8089 yarn.router.webapp.address 0.0.0.0:8089
8091 yarn.router.webapp.https.address 0.0.0.0:8091
8047 yarn.sharedcache.admin.address 0.0.0.0:8047
8045 yarn.sharedcache.client-server.address 0.0.0.0:8045
8046 yarn.sharedcache.uploader.server.address 0.0.0.0:8046
8788 yarn.sharedcache.webapp.address 0.0.0.0:8788
10200 yarn.timeline-service.address ${yarn.timeline-service.hostname}:10200
8188 yarn.timeline-service.webapp.address ${yarn.timeline-service.hostname}:8188
8190 yarn.timeline-service.webapp.https.address ${yarn.timeline-service.hostname}:8190