1 XML
1.1 core-site
current: r3.1.1
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
<!-- Point fs.defaultFS at the HDFS nameservice bigha (defined in hdfs-site.xml). An HA nameservice takes no port here; the per-NameNode RPC addresses in hdfs-site.xml use 8020 -->
<property>
<name>fs.defaultFS</name>
<value>hdfs://bigha</value>
</property>
<!-- Base directory for files Hadoop generates at runtime -->
<property>
<name>hadoop.tmp.dir</name>
<value>/opt/ws/hadoop/tmp</value>
</property>
<!-- How long deleted files stay in the trash before being purged, in minutes (1440 = 1 day) -->
<property>
<name>fs.trash.interval</name>
<value>1440</value>
</property>
<!-- Read/write buffer size used by SequenceFiles, in bytes; should be a multiple of the 4 KB page size -->
<property>
<name>io.file.buffer.size</name>
<value>65536</value>
</property>
<!-- ZooKeeper quorum used by the ZKFailoverController; multiple addresses are comma-separated -->
<property>
<name>ha.zookeeper.quorum</name>
<value>node2:2181,node3:2181,node4:2181</value>
</property>
<!-- ZooKeeper session (heartbeat) timeout, in milliseconds -->
<property>
<name>ha.zookeeper.session-timeout.ms</name>
<value>300000</value>
</property>
</configuration>
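A quick way to sanity-check this file from the client side is a tiny Hadoop-API probe. A minimal sketch, assuming the hadoop-client jars plus this cluster's core-site.xml and hdfs-site.xml are on the classpath (the class name is made up); with fs.defaultFS set to hdfs://bigha, the client never names a specific NameNode:

```java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class DefaultFsCheck {
    public static void main(String[] args) throws Exception {
        // Configuration loads core-site.xml from the classpath.
        Configuration conf = new Configuration();
        System.out.println("fs.defaultFS = " + conf.get("fs.defaultFS")); // expect hdfs://bigha
        try (FileSystem fs = FileSystem.get(conf)) {
            // Paths resolve against the bigha nameservice; the failover
            // proxy provider in hdfs-site.xml picks the active NameNode.
            System.out.println("exists(/tmp): " + fs.exists(new Path("/tmp")));
        }
    }
}
```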
1.2 hdfs-site
current: r3.1.1
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
<!-- dfs.nameservices: logical name(s) of the nameservice(s); multiple names are comma-separated -->
<property>
<name>dfs.nameservices</name>
<value>bigha</value>
</property>
<!-- The bigha nameservice has two NameNodes, nn1 and nn2 -->
<property>
<name>dfs.ha.namenodes.bigha</name>
<value>nn1,nn2</value>
</property>
<!-- RPC address of nn1 -->
<property>
<name>dfs.namenode.rpc-address.bigha.nn1</name>
<value>node0:8020</value>
</property>
<!-- HTTP address of nn1 -->
<property>
<name>dfs.namenode.http-address.bigha.nn1</name>
<value>node0:50070</value>
</property>
<!-- RPC address of nn2 -->
<property>
<name>dfs.namenode.rpc-address.bigha.nn2</name>
<value>node1:8020</value>
</property>
<!-- HTTP address of nn2 -->
<property>
<name>dfs.namenode.http-address.bigha.nn2</name>
<value>node1:50070</value>
</property>
<!-- JournalNodes that store the NameNodes' shared edit log; the count must be odd, at least three -->
<property>
<name>dfs.namenode.shared.edits.dir</name>
<value>qjournal://node2:8485;node3:8485;node4:8485/bigha</value>
</property>
<!-- Local directory where the JournalNode daemon keeps its state; an absolute path on the Linux server -->
<property>
<name>dfs.journalnode.edits.dir</name>
<value>/opt/ws/hadoop/journal/</value>
</property>
<!-- Enable automatic failover when the active NameNode fails -->
<property>
<name>dfs.ha.automatic-failover.enabled</name>
<value>true</value>
</property>
<!-- Proxy provider that clients use to locate the active NameNode -->
<property>
<name>dfs.client.failover.proxy.provider.bigha</name>
<value>org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider</value>
</property>
<!-- Fencing methods, one per line, tried in order -->
<property>
<name>dfs.ha.fencing.methods</name>
<value>
sshfence
shell(/bin/true)
</value>
</property>
<!-- sshfence needs passwordless SSH; path to the private key -->
<property>
<name>dfs.ha.fencing.ssh.private-key-files</name>
<value>/home/lidong/.ssh/id_rsa</value>
</property>
<!-- SSH connect timeout for sshfence: 30 seconds -->
<property>
<name>dfs.ha.fencing.ssh.connect-timeout</name>
<value>30000</value>
</property>
<!-- Disk space reserved per volume so DataNodes cannot fill the disk, in bytes (2 GB here) -->
<property>
<name>dfs.datanode.du.reserved</name>
<value>2147483648</value>
</property>
<!-- Directory where the NameNode stores its namespace (fsimage and edits) -->
<property>
<name>dfs.namenode.name.dir</name>
<value>file:///opt/ws/hadoop/hdfs/name</value>
</property>
<!-- Directory where DataNodes store block data -->
<property>
<name>dfs.datanode.data.dir</name>
<value>file:///opt/ws/hadoop/hdfs/data</value>
</property>
<!-- Replication factor -->
<property>
<name>dfs.replication</name>
<value>3</value>
</property>
<!-- Enable WebHDFS so HDFS can be browsed over HTTP -->
<property>
<name>dfs.webhdfs.enabled</name>
<value>true</value>
</property>
<!-- NameNode server thread count -->
<property>
<name>dfs.namenode.handler.count</name>
<value>200</value>
<description>The number of server threads for the namenode.</description>
</property>
<!-- DataNode server thread count -->
<property>
<name>dfs.datanode.handler.count</name>
<value>200</value>
<description>The number of server threads for the datanode.</description>
</property>
<!-- Maximum number of threads for data transfer -->
<property>
<name>dfs.datanode.max.transfer.threads</name>
<value>1024</value>
</property>
<!-- Block size in bytes (5 MB here; the stock default is 128 MB) -->
<property>
<name>dfs.blocksize</name>
<value>5242880</value>
</property>
<!-- Timeout for writing transactions to the JournalNodes, in milliseconds -->
<property>
<name>dfs.qjournal.write-txns.timeout.ms</name>
<value>300000</value>
</property>
<!--
<property>
<name>dfs.namenode.fs-limits.min-block-size</name>
<value>1048576</value>
</property>
<property>
<name>dfs.namenode.fs-limits.max-blocks-per-file</name>
<value>1048576</value>
</property>
-->
</configuration>
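Failover state can be checked with `hdfs haadmin -getServiceState nn1`; the nn1/nn2 wiring above can also be read back programmatically. A minimal sketch, assuming hdfs-site.xml is on the classpath (plain key lookups only, nothing HA-specific is invoked):

```java
import org.apache.hadoop.conf.Configuration;

public class ListHaNameNodes {
    public static void main(String[] args) {
        Configuration conf = new Configuration();
        conf.addResource("hdfs-site.xml");
        // Same keys this file defines: the nameservice, then per-NN addresses.
        String ns = conf.get("dfs.nameservices"); // "bigha"
        for (String nn : conf.getTrimmedStrings("dfs.ha.namenodes." + ns)) {
            String rpc  = conf.get("dfs.namenode.rpc-address."  + ns + "." + nn);
            String http = conf.get("dfs.namenode.http-address." + ns + "." + nn);
            System.out.printf("%s: rpc=%s http=%s%n", nn, rpc, http);
        }
    }
}
```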
1.3 mapred-site
current: r3.1.1
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
<!-- Run the MapReduce framework on YARN -->
<property>
<name>mapreduce.framework.name</name>
<value>yarn</value>
</property>
<!-- Number of map tasks per job; only a hint, since the input splits normally decide the map count -->
<property>
<name>mapreduce.job.maps</name>
<value>4</value>
</property>
<!-- Number of reduce tasks per job -->
<property>
<name>mapreduce.job.reduces</name>
<value>4</value>
</property>
<!-- Physical memory per map/reduce task, in MB; default is 1024 -->
<property>
<name>mapreduce.map.memory.mb</name>
<value>1024</value>
</property>
<property>
<name>mapreduce.reduce.memory.mb</name>
<value>1024</value>
</property>
<!-- JVM options per task; default is -Xmx200m (rule of thumb: about 80% of the matching memory.mb) -->
<property>
<name>mapreduce.map.java.opts</name>
<value>-Xmx200m</value>
</property>
<property>
<name>mapreduce.reduce.java.opts</name>
<value>-Xmx200m</value>
</property>
<!-- vcores per task; default is 1 -->
<!--
<property>
<name>mapreduce.map.cpu.vcores</name>
<value>1</value>
</property>
<property>
<name>mapreduce.reduce.cpu.vcores</name>
<value>1</value>
</property>
-->
<!-- Memory for the MapReduce ApplicationMaster container, in MB -->
<property>
<name>yarn.app.mapreduce.am.resource.mb</name>
<value>512</value>
</property>
<!-- JobHistory server address -->
<property>
<name>mapreduce.jobhistory.address</name>
<value>node0:10020</value>
</property>
<!-- JobHistory web UI address -->
<property>
<name>mapreduce.jobhistory.webapp.address</name>
<value>node0:19888</value>
</property>
</configuration>
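The memory.mb/java.opts pair can also be overridden per job instead of cluster-wide. A minimal sketch, assuming a job class of your own (mapper, reducer, and input/output paths are elided placeholders); the heap is kept below the container size:

```java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

public class SubmitWithMemoryOverrides {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.setInt("mapreduce.map.memory.mb", 1024);    // container size
        conf.set("mapreduce.map.java.opts", "-Xmx820m"); // ~80% of the container
        conf.setInt("mapreduce.reduce.memory.mb", 1024);
        conf.set("mapreduce.reduce.java.opts", "-Xmx820m");
        Job job = Job.getInstance(conf, "mem-override-demo");
        // job.setJarByClass(...), mapper/reducer classes, and
        // input/output paths would be set here.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
```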
1.4 yarn-site
current: r3.1.1
<?xml version="1.0" encoding="UTF-8"?>
<configuration>
<!-- Enable log aggregation -->
<property>
<name>yarn.log-aggregation-enable</name>
<value>true</value>
</property>
<!-- How long aggregated logs are kept on the DFS, in seconds (note: 4320 s is only 72 minutes) -->
<property>
<name>yarn.log-aggregation.retain-seconds</name>
<value>4320</value>
</property>
<!-- Directory on the default filesystem (bigha) where aggregated logs are stored -->
<property>
<name>yarn.nodemanager.remote-app-log-dir</name>
<value>/tmp/logs</value>
</property>
<!-- Retry interval for reconnecting to the RM after losing contact, in milliseconds -->
<property>
<name>yarn.resourcemanager.connect.retry-interval.ms</name>
<value>2000</value>
</property>
<!-- ZooKeeper quorum for RM leader election and state storage -->
<property>
<name>yarn.resourcemanager.zk-address</name>
<value>node2:2181,node3:2181,node4:2181</value>
</property>
<!-- Cluster ID; ensures this RM never becomes active for another cluster -->
<property>
<name>yarn.resourcemanager.cluster-id</name>
<value>bigcluster</value>
</property>
<!-- Enable RM HA -->
<property>
<name>yarn.resourcemanager.ha.enabled</name>
<value>true</value>
</property>
<!-- Logical IDs of the RMs -->
<property>
<name>yarn.resourcemanager.ha.rm-ids</name>
<value>rm1,rm2</value>
</property>
<!-- Hostname for rm1 -->
<property>
<name>yarn.resourcemanager.hostname.rm1</name>
<value>node0</value>
</property>
<!-- Hostname for rm2 -->
<property>
<name>yarn.resourcemanager.hostname.rm2</name>
<value>node5</value>
</property>
<!-- Per-RM service addresses: client RPC (8032), scheduler (8030), resource tracker (8031), web UI (8088) -->
<property>
<name>yarn.resourcemanager.address.rm1</name>
<value>node0:8032</value>
</property>
<property>
<name>yarn.resourcemanager.address.rm2</name>
<value>node5:8032</value>
</property>
<property>
<name>yarn.resourcemanager.scheduler.address.rm1</name>
<value>node0:8030</value>
</property>
<property>
<name>yarn.resourcemanager.scheduler.address.rm2</name>
<value>node5:8030</value>
</property>
<property>
<name>yarn.resourcemanager.resource-tracker.address.rm1</name>
<value>node0:8031</value>
</property>
<property>
<name>yarn.resourcemanager.resource-tracker.address.rm2</name>
<value>node5:8031</value>
</property>
<property>
<name>yarn.resourcemanager.webapp.address.rm1</name>
<value>node0:8088</value>
</property>
<property>
<name>yarn.resourcemanager.webapp.address.rm2</name>
<value>node5:8088</value>
</property>
<!-- Enable automatic failover -->
<property>
<name>yarn.resourcemanager.ha.automatic-failover.enabled</name>
<value>true</value>
</property>
<!-- Use the embedded ZooKeeper-based leader elector -->
<property>
<name>yarn.resourcemanager.ha.automatic-failover.embedded</name>
<value>true</value>
</property>
<!-- Base znode for RM leader election -->
<property>
<name>yarn.resourcemanager.ha.automatic-failover.zk-base-path</name>
<value>/yarn-leader-election</value>
</property>
<!-- Enable RM state recovery after a restart or failover -->
<property>
<name>yarn.resourcemanager.recovery.enabled</name>
<value>true</value>
</property>
<!-- Tasks launched by the AM do not inherit the parent process's classpath; supply it here, or per job with -libjars -->
<property>
<name>yarn.application.classpath</name>
<value>
/opt/hadoop/,
/opt/hadoop/etc/hadoop/*,
/opt/hadoop/share/hadoop/common/*,/opt/hadoop/share/hadoop/common/lib/*,
/opt/hadoop/share/hadoop/hdfs/*,/opt/hadoop/share/hadoop/hdfs/lib/*,
/opt/hadoop/share/hadoop/mapreduce/*,/opt/hadoop/share/hadoop/mapreduce/lib/*,
/opt/hadoop/share/hadoop/yarn/*,/opt/hadoop/share/hadoop/yarn/lib/*,
/opt/hadoop/share/hadoop/tools/lib/*,
/opt/hbase/conf/,/opt/hbase/lib/*
</value>
</property>
<!-- Persist RM state in ZooKeeper -->
<property>
<name>yarn.resourcemanager.store.class</name>
<value>org.apache.hadoop.yarn.server.resourcemanager.recovery.ZKRMStateStore</value>
</property>
<!-- ZNode under which the RM state is stored -->
<property>
<name>yarn.resourcemanager.zk-state-store.parent-path</name>
<value>/rmstore</value>
</property>
<!-- Auxiliary service NodeManagers run so reducers can fetch map output (mapreduce_shuffle) -->
<property>
<name>yarn.nodemanager.aux-services</name>
<value>mapreduce_shuffle</value>
</property>
<!--
<property>
<name>yarn.nodemanager.aux-services.mapreduce_shuffle.class</name>
<value>org.apache.hadoop.mapred.ShuffleHandler</value>
</property>
-->
<!-- Total physical memory available to containers on this node, in MB; default is 8192 -->
<property>
<name>yarn.nodemanager.resource.memory-mb</name>
<value>1024</value>
</property>
<!-- Total vcores available to containers; default is 8 -->
<property>
<name>yarn.nodemanager.resource.cpu-vcores</name>
<value>1</value>
</property>
<!-- Minimum memory per container request, in MB; default is 1024 -->
<property>
<name>yarn.scheduler.minimum-allocation-mb</name>
<value>256</value>
</property>
<!-- Minimum vcores per container request; default is 1 -->
<property>
<name>yarn.scheduler.minimum-allocation-vcores</name>
<value>1</value>
</property>
<!-- Maximum memory per container request, in MB; default is 8192 -->
<property>
<name>yarn.scheduler.maximum-allocation-mb</name>
<value>1024</value>
</property>
<!-- Maximum vcores per container request; default is 4 -->
<property>
<name>yarn.scheduler.maximum-allocation-vcores</name>
<value>1</value>
</property>
<!-- Enforce the physical-memory limit; a container is killed if it exceeds mapreduce.map|reduce.memory.mb -->
<property>
<name>yarn.nodemanager.pmem-check-enabled</name>
<value>true</value>
</property>
<!-- Enforce the virtual-memory limit; a container is killed if it exceeds yarn.nodemanager.vmem-pmem-ratio times mapreduce.map|reduce.memory.mb -->
<property>
<name>yarn.nodemanager.vmem-check-enabled</name>
<value>true</value>
</property>
<!-- Ratio of virtual to physical memory; default is 2.1 (6.0 here, so a 1024 MB container may use up to 6144 MB of vmem) -->
<property>
<name>yarn.nodemanager.vmem-pmem-ratio</name>
<value>6.0</value>
</property>
<!-- Where the web UIs redirect for aggregated logs (the JobHistory server) -->
<property>
<name>yarn.log.server.url</name>
<value>http://node0:19888/jobhistory/logs</value>
</property>
</configuration>
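To confirm what each NodeManager actually advertises under these limits (1024 MB / 1 vcore per node here), a minimal YarnClient sketch, assuming yarn-site.xml is on the classpath and an RM is reachable:

```java
import org.apache.hadoop.yarn.api.records.NodeReport;
import org.apache.hadoop.yarn.api.records.NodeState;
import org.apache.hadoop.yarn.client.api.YarnClient;
import org.apache.hadoop.yarn.conf.YarnConfiguration;

public class ListNodeCapacities {
    public static void main(String[] args) throws Exception {
        // YarnConfiguration loads yarn-default.xml plus yarn-site.xml.
        YarnConfiguration conf = new YarnConfiguration();
        try (YarnClient yarn = YarnClient.createYarnClient()) {
            yarn.init(conf);
            yarn.start();
            for (NodeReport node : yarn.getNodeReports(NodeState.RUNNING)) {
                System.out.printf("%s: %d MB, %d vcores%n",
                        node.getNodeId(),
                        node.getCapability().getMemorySize(),
                        node.getCapability().getVirtualCores());
            }
        }
    }
}
```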
1.5 hbase-site
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
<!-- Root directory for HBase data on HDFS; the filesystem must match fs.defaultFS in core-site.xml exactly (here the HA nameservice bigha) -->
<property>
<name>hbase.rootdir</name>
<value>hdfs://bigha/hbase</value>
</property>
<!-- Root znode for HBase in ZooKeeper -->
<property>
<name>zookeeper.znode.parent</name>
<value>/hbase</value>
</property>
<!-- Run in distributed mode -->
<property>
<name>hbase.cluster.distributed</name>
<value>true</value>
</property>
<!-- ZooKeeper quorum hosts (default client port 2181) -->
<property>
<name>hbase.zookeeper.quorum</name>
<value>node2,node3,node4</value>
</property>
<!-- Local directory for HBase temporary files, e.g. table pre-split information -->
<property>
<name>hbase.tmp.dir</name>
<value>/opt/ws/hbase/tmp</value>
</property>
<!-- ZooKeeper data directory (keep consistent with zoo.cfg) -->
<property>
<name>hbase.zookeeper.property.dataDir</name>
<value>/opt/ws/zookeeper/data</value>
</property>
<!-- ZooKeeper client port -->
<property>
<name>hbase.zookeeper.property.clientPort</name>
<value>2181</value>
</property>
<!-- RPC handler thread count on each RegionServer (the Master uses the same property for its handlers) -->
<property>
<name>hbase.regionserver.handler.count</name>
<value>10</value>
</property>
<!-- Session timeout between a RegionServer and ZooKeeper.
When it expires, ZooKeeper drops the RegionServer from the live-server list; once notified,
the HMaster rebalances that server's regions onto the surviving RegionServers. -->
<property>
<name>zookeeper.session.timeout</name>
<value>30000</value>
</property>
<!-- Maximum time an edit is cached in memory before an automatic flush, in milliseconds; default 3600000 (1 hour) -->
<property>
<name>hbase.regionserver.optionalcacheflushinterval</name>
<value>7200000</value>
</property>
<!-- Fraction of the maximum heap (the -Xmx setting) given to the HFile/StoreFile block cache. Default 0.4 (40%); 0 disables it, which is not recommended. -->
<property>
<name>hfile.block.cache.size</name>
<value>0.3</value>
</property>
<!-- Maximum store file size; when a region's store files grow beyond this, the region is split in two -->
<property>
<name>hbase.hregion.max.filesize</name>
<value>134217728</value>
</property>
<!-- Memstore flush threshold; a memstore larger than this is flushed to disk -->
<property>
<name>hbase.hregion.memstore.flush.size</name>
<value>134217728</value>
</property>
<!-- HDFS client socket timeout; increase it where possible -->
<property>
<name>dfs.client.socket-timeout</name>
<value>60000</value>
</property>
<!-- Default ports:
<property>
<name>hbase.master.port</name>
<value>60000</value>
</property>
<property>
<name>hbase.master.info.port</name>
<value>60010</value>
</property>
<property>
<name>hbase.regionserver.port</name>
<value>60020</value>
</property>
<property>
<name>hbase.regionserver.info.port</name>
<value>60030</value>
</property>
-->
</configuration>
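A client only needs the ZooKeeper quorum from this file to reach the cluster; it finds region locations via ZooKeeper rather than contacting the master for reads/writes. A minimal sketch, assuming the hbase-client jars are available; the table demo with column family cf is a made-up placeholder and must already exist:

```java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.util.Bytes;

public class HBaseQuickCheck {
    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        // Same quorum and client port as configured above.
        conf.set("hbase.zookeeper.quorum", "node2,node3,node4");
        conf.set("hbase.zookeeper.property.clientPort", "2181");
        try (Connection conn = ConnectionFactory.createConnection(conf);
             Table table = conn.getTable(TableName.valueOf("demo"))) {
            Put put = new Put(Bytes.toBytes("row1"));
            put.addColumn(Bytes.toBytes("cf"), Bytes.toBytes("q"), Bytes.toBytes("v"));
            table.put(put); // one round trip to the owning RegionServer
        }
    }
}
```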
2 Default Ports
PORT | CONFIG NAME | CONFIG VALUE |
---|---|---|
0 | dfs.balancer.address | 0.0.0.0:0 |
9866 | dfs.datanode.address | 0.0.0.0:9866 |
9864 | dfs.datanode.http.address | 0.0.0.0:9864 |
9865 | dfs.datanode.https.address | 0.0.0.0:9865 |
9867 | dfs.datanode.ipc.address | 0.0.0.0:9867 |
8111 | dfs.federation.router.admin-address | 0.0.0.0:8111 |
50071 | dfs.federation.router.http-address | 0.0.0.0:50071 |
50072 | dfs.federation.router.https-address | 0.0.0.0:50072 |
8888 | dfs.federation.router.rpc-address | 0.0.0.0:8888 |
8480 | dfs.journalnode.http-address | 0.0.0.0:8480 |
8481 | dfs.journalnode.https-address | 0.0.0.0:8481 |
8485 | dfs.journalnode.rpc-address | 0.0.0.0:8485 |
0 | dfs.mover.address | 0.0.0.0:0 |
50100 | dfs.namenode.backup.address | 0.0.0.0:50100 |
50105 | dfs.namenode.backup.http-address | 0.0.0.0:50105 |
9870 | dfs.namenode.http-address | 0.0.0.0:9870 |
9871 | dfs.namenode.https-address | 0.0.0.0:9871 |
9868 | dfs.namenode.secondary.http-address | 0.0.0.0:9868 |
9869 | dfs.namenode.secondary.https-address | 0.0.0.0:9869 |
50200 | dfs.provided.aliasmap.inmemory.dnrpc-address | 0.0.0.0:50200 |
2181 | hadoop.registry.zk.quorum | localhost:2181 |
10020 | mapreduce.jobhistory.address | 0.0.0.0:10020 |
10033 | mapreduce.jobhistory.admin.address | 0.0.0.0:10033 |
19888 | mapreduce.jobhistory.webapp.address | 0.0.0.0:19888 |
19890 | mapreduce.jobhistory.webapp.https.address | 0.0.0.0:19890 |
0 | yarn.nodemanager.address | ${yarn.nodemanager.hostname}:0 |
8049 | yarn.nodemanager.amrmproxy.address | 0.0.0.0:8049 |
8048 | yarn.nodemanager.collector-service.address | ${yarn.nodemanager.hostname}:8048 |
8040 | yarn.nodemanager.localizer.address | ${yarn.nodemanager.hostname}:8040 |
8042 | yarn.nodemanager.webapp.address | ${yarn.nodemanager.hostname}:8042 |
8044 | yarn.nodemanager.webapp.https.address | 0.0.0.0:8044 |
8032 | yarn.resourcemanager.address | ${yarn.resourcemanager.hostname}:8032 |
8033 | yarn.resourcemanager.admin.address | ${yarn.resourcemanager.hostname}:8033 |
8031 | yarn.resourcemanager.resource-tracker.address | ${yarn.resourcemanager.hostname}:8031 |
8030 | yarn.resourcemanager.scheduler.address | ${yarn.resourcemanager.hostname}:8030 |
8088 | yarn.resourcemanager.webapp.address | ${yarn.resourcemanager.hostname}:8088 |
8090 | yarn.resourcemanager.webapp.https.address | ${yarn.resourcemanager.hostname}:8090 |
8089 | yarn.router.webapp.address | 0.0.0.0:8089 |
8091 | yarn.router.webapp.https.address | 0.0.0.0:8091 |
8047 | yarn.sharedcache.admin.address | 0.0.0.0:8047 |
8045 | yarn.sharedcache.client-server.address | 0.0.0.0:8045 |
8046 | yarn.sharedcache.uploader.server.address | 0.0.0.0:8046 |
8788 | yarn.sharedcache.webapp.address | 0.0.0.0:8788 |
10200 | yarn.timeline-service.address | ${yarn.timeline-service.hostname}:10200 |
8188 | yarn.timeline-service.webapp.address | ${yarn.timeline-service.hostname}:8188 |
8190 | yarn.timeline-service.webapp.https.address | ${yarn.timeline-service.hostname}:8190 |
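These defaults come from the *-default.xml files bundled inside the Hadoop jars. A minimal sketch that reads a couple of them back, assuming the Hadoop HDFS and YARN client jars are on the classpath (site overrides, if present, win):

```java
import org.apache.hadoop.hdfs.HdfsConfiguration;
import org.apache.hadoop.yarn.conf.YarnConfiguration;

public class PrintDefaultPorts {
    public static void main(String[] args) {
        // HdfsConfiguration loads hdfs-default.xml (+ hdfs-site.xml if present).
        HdfsConfiguration hdfs = new HdfsConfiguration();
        System.out.println("dfs.namenode.http-address = "
                + hdfs.get("dfs.namenode.http-address"));   // 0.0.0.0:9870
        // YarnConfiguration loads yarn-default.xml (+ yarn-site.xml if present).
        YarnConfiguration yarn = new YarnConfiguration();
        System.out.println("yarn.resourcemanager.webapp.address = "
                + yarn.get("yarn.resourcemanager.webapp.address"));
    }
}
```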