命令行部署#
前提#
LittleBoy前端服务需要依赖Nginx服务
Nginx安装部署请参考:负载均衡 Nginx安装。
首先登录到oushu1,然后切换到root用户
ssh oushu1
su - root
创建一个serverhost
文件,包含LittleBoy集群中所有的Server(master)机器
cat > ${HOME}/serverhost << EOF
oushu1
oushu2
EOF
创建一个lbhost
文件,包含LittleBoy集群中所有的机器(Server和Worker)
cat > ${HOME}/lbhost << EOF
oushu1
oushu2
oushu3
EOF
在oushu1节点配置yum源,安装lava命令行管理工具
# 从yum源所在机器获取repo文件
scp oushu@192.168.1.10:/etc/yum.repos.d/oushu.repo /etc/yum.repos.d/oushu.repo
# 追加yum源所在机器信息到/etc/hosts文件
# 安装lava命令行管理工具
yum clean all
yum makecache
yum install lava
oushu1节点和集群内其他节点交换公钥,以便ssh免密码登录和分发配置文件。
lava ssh-exkeys -f ${HOME}/lbhost -p ********
分发repo文件到其他机器
lava scp -f ${HOME}/lbhost /etc/yum.repos.d/oushu.repo =:/etc/yum.repos.d
安装#
安装LittleBoy前端服务,只需在oushu1安装即可
yum install littleboy-fe
安装LittleBoy Server/Worker
lava ssh -f ${HOME}/lbhost -e "yum install -y littleboy"
配置#
配置LittleBoy前端服务#
/usr/local/nginx/conf/premise-frontend.conf
配置文件中添加前端nginx服务
# littleboy
server {
listen 1895;
listen [::]:1895;
server_name _;
gzip_static on;
error_page 404 /404.html;
# redirect server error pages to the static page /50x.html
error_page 500 502 503 504 /50x.html;
location = /50x.html {
root html;
}
# common router
location / {
root /usr/local/oushu/littleboy-fe;
try_files $uri $uri/ /index.html;
index index.html;
add_header Cache-Control no-cache;
add_header Access-Control-Allow-Origin *;
add_header Access-Control-Allow-Methods 'GET, POST, OPTIONS';
add_header Access-Control-Allow-Headers 'DNT,X-Mx-ReqToken,Keep-Alive,User-Agent,X-Requested-With,If-Modified-Since,Cache-Control,Content-Type,Authorization';
}
}
/usr/local/nginx/conf/premise-frontend.conf
配置文件中添加转发规则
# 添加websocket转发地址,在nginx配置文件的最外层
upstream websocket-littleboy {
server 127.0.0.1:1885;
}
# 在lava nginx服务中添加转发规则
server {
listen 3000;
server_name localhost;
error_page 404 /404.html;
# ......
# 此处省略其他服务配置
# 以下是需要添加的配置
# littleboy
location ~ ^/api/lava/littleboy(.*) {
proxy_pass http://127.0.0.1:1885/lava/littleboy$1$is_args$args;
proxy_set_header Host $http_host;
proxy_set_header X-Real-IP $remote_addr;
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
}
location ~ ^/main/littleboy/assets(.*) {
proxy_pass http://127.0.0.1:1895/assets$1$is_args$args;
proxy_set_header Host $http_host;
proxy_set_header X-Real-IP $remote_addr;
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
}
location ~ ^/ws/lava/littleboy/(.*) {
proxy_pass http://websocket-littleboy/lava/littleboy/$1$is_args$args;
proxy_read_timeout 100s;
proxy_send_timeout 100s;
proxy_set_header Host $host;
proxy_set_header X-Real-IP $remote_addr;
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
proxy_http_version 1.1;
proxy_set_header Upgrade $http_upgrade;
proxy_set_header Connection $connection_upgrade;
}
location ~ ^/lava/littleboy/notebook/(.*) {
proxy_pass http://websocket-littleboy/lava/littleboy/notebook/$1$is_args$args;
proxy_read_timeout 100s;
proxy_send_timeout 100s;
proxy_set_header Host $host;
proxy_set_header X-Real-IP $remote_addr;
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
proxy_http_version 1.1;
proxy_set_header Upgrade $http_upgrade;
proxy_set_header Connection $connection_upgrade;
}
# 以上是添加的配置
# 此处省略其他服务配置
# ......
# redirect server error pages to the static page /50x.html
error_page 500 502 503 504 /50x.html;
location = /50x.html {
root html;
}
# common router
location / {
root /usr/local/oushu/premise-ui-common/lava-fe-core/dist;
try_files $uri $uri/ /index.html;
index index.html;
add_header Cache-Control no-cache;
add_header Access-Control-Allow-Origin *;
add_header Access-Control-Allow-Methods 'GET, POST, OPTIONS';
add_header Access-Control-Allow-Headers 'DNT,X-Mx-ReqToken,Keep-Alive,User-Agent,X-Requested-With,If-Modified-Since,Cache-Control,Content-Type,Authorization';
}
}
配置LittleBoy基础服务#
配置数据库连接/usr/local/oushu/littleboy/conf/postgres.config.xml
<?xml version="1.0" encoding="utf-8"?>
<!--
~ Copyright (c) 2018. Oushu
-->
<postgres>
<host>localhost</host>
<port>4432</port>
<user>oushu</user>
<password>*******</password>
<dbname>littleboy</dbname>
<sslmode>disable</sslmode>
</postgres>
根据实际部署情况修改配置文件/usr/local/oushu/littleboy/conf/lbserver-site.xml
<?xml version="1.0" encoding="UTF-8"?>
<configuration>
<property>
<name>basic.master.hosts</name>
<value>server1,server2,server3</value>
<description>LittleBoy所有server的地址(hostname)</description>
</property>
<property>
<name>basic.master.lava.protocol</name>
<value>https</value>
<description>Lava REST API服务协议类型</description>
</property>
<property>
<name>basic.master.lava.host</name>
<value>localhost</value>
<description>Lava REST API地址(hostname)</description>
</property>
<property>
<name>basic.master.lava.port</name>
<value>443</value>
<description>Lava REST API端口</description>
</property>
<property>
<name>basic.master.lava.rpc.port</name>
<value>8081</value>
<description>Lava RPC端口</description>
</property>
<property>
<name>basic.master.admin</name>
<value>https://deployserver:1651</value>
<description>Lava自动部署REST API地址。可选配置,如果没有配置将通过Lava接口查询自动部署REST API地址,如果配置了,将使用配置的地址</description>
</property>
<property>
<name>basic.master.port</name>
<value>1885</value>
<description>LittleBoy Server REST API端口</description>
</property>
<property>
<name>gossip.port</name>
<value>1888</value>
<description>LittleBoy Server同步端口</description>
</property>
<property>
<name>gossip.seed</name>
<value>server1:1888,server2:1888,server3:1888</value>
<description>LittleBoy Server集群同步成员</description>
</property>
<property>
<name>basic.master.storage</name>
<value>/littleboy</value>
<description>LittleBoy Server HDFS存储路径,用于中间文件,模型文件的存储</description>
</property>
<property>
<name>basic.master.hdfsuser</name>
<value>oushu</value>
<description>LittleBoy Server HDFS 访问用户</description>
</property>
<property>
<name>basic.logDir</name>
<value>/usr/local/oushu/log/littleboy</value>
<description>LittleBoy Server日志文件目录</description>
</property>
<property>
<name>basic.logLevel</name>
<value>info</value>
<description>LittleBoy Server日志级别</description>
</property>
<property>
<name>basic.storage</name>
<value>/data1/littleboy</value>
<description>LittleBoy Server临时文件目录</description>
</property>
</configuration>
修改HDFS客户端配置文件/usr/local/oushu/littleboy/conf/hdfs-client.xml
<?xml version="1.0" encoding="UTF-8"?>
<configuration>
<property>
<name>rpc.client.timeout</name>
<value>3600000</value>
</property>
<property>
<name>rpc.client.connect.tcpnodelay</name>
<value>true</value>
</property>
<property>
<name>rpc.client.max.idle</name>
<value>10000</value>
</property>
<property>
<name>rpc.client.ping.interval</name>
<value>10000</value>
</property>
<property>
<name>rpc.client.connect.timeout</name>
<value>600000</value>
</property>
<property>
<name>rpc.client.connect.retry</name>
<value>10</value>
</property>
<property>
<name>rpc.client.read.timeout</name>
<value>3600000</value>
</property>
<property>
<name>rpc.client.write.timeout</name>
<value>3600000</value>
</property>
<property>
<name>rpc.client.socket.linger.timeout</name>
<value>-1</value>
</property>
<property>
<name>dfs.client.read.shortcircuit</name>
<value>true</value>
</property>
<property>
<name>dfs.default.replica</name>
<value>3</value>
</property>
<property>
<name>dfs.prefetchsize</name>
<value>10</value>
</property>
<property>
<name>dfs.client.failover.max.attempts</name>
<value>15</value>
</property>
<property>
<name>dfs.default.blocksize</name>
<value>134217728</value>
</property>
<property>
<name>dfs.client.log.severity</name>
<value>INFO</value>
</property>
<property>
<name>input.connect.timeout</name>
<value>600000</value>
</property>
<property>
<name>input.read.timeout</name>
<value>3600000</value>
</property>
<property>
<name>input.write.timeout</name>
<value>3600000</value>
</property>
<property>
<name>input.localread.default.buffersize</name>
<value>2097152</value>
</property>
<property>
<name>input.localread.blockinfo.cachesize</name>
<value>1000</value>
</property>
<property>
<name>input.read.getblockinfo.retry</name>
<value>3</value>
</property>
<property>
<name>output.replace-datanode-on-failure</name>
<value>false</value>
</property>
<property>
<name>output.default.chunksize</name>
<value>512</value>
</property>
<property>
<name>output.default.packetsize</name>
<value>65536</value>
</property>
<property>
<name>output.default.write.retry</name>
<value>10</value>
</property>
<property>
<name>output.connect.timeout</name>
<value>600000</value>
</property>
<property>
<name>output.read.timeout</name>
<value>3600000</value>
</property>
<property>
<name>output.write.timeout</name>
<value>3600000</value>
</property>
<property>
<name>output.packetpool.size</name>
<value>1024</value>
</property>
<property>
<name>output.close.timeout</name>
<value>900000</value>
</property>
<property>
<name>dfs.domain.socket.path</name>
<value>/var/lib/hadoop-hdfs/dn_socket</value>
</property>
<property>
<name>dfs.client.use.legacy.blockreader.local</name>
<value>false</value>
</property>
<property>
<name>dfs.ha.namenodes.oushu</name>
<value>nn1,nn2</value>
</property>
<property>
<name>dfs.namenode.http-address.oushu.nn1</name>
<value>namenode1:50070</value>
</property>
<property>
<name>dfs.namenode.http-address.oushu.nn2</name>
<value>namenode2:50070</value>
</property>
<property>
<name>dfs.namenode.rpc-address.oushu.nn1</name>
<value>namenode1:9000</value>
</property>
<property>
<name>dfs.namenode.rpc-address.oushu.nn2</name>
<value>namenode2:9000</value>
</property>
<property>
<name>dfs.nameservices</name>
<value>oushu</value>
</property>
</configuration>
在/usr/local/oushu/littleboy/conf/littleboy-env.sh
文件中修改环境变量JAVA_HOME
#!/usr/bin/env bash
this="${BASH_SOURCE-$0}"
export LITTLEBOY_HOME=$(cd -- "$(dirname -- "$this")/.." && pwd -P)
export DEPENDENCE_HOME=/usr/local/oushu/littleboy-dependence
platform=`uname`
# might need config manually START
export LB_DEVICE_TYPE=CPU # need to restart process for predict, but will come into force immediately in the next train. # Should check multi-device cluster availability.
export JAVA_HOME=${JAVA_HOME}
# config for hdfs client
export LIBHDFS3_CONF=${LITTLEBOY_HOME}/conf/hdfs-client.xml
# might need config manually END
export JARS_DIR=${LITTLEBOY_HOME}/jars
export PYTHON_BIN=${DEPENDENCE_HOME}/conda3/bin/python3
export LIBHDFS_PATH=${DEPENDENCE_HOME}/lib/libhdfs.so
export ARROW_LIBHDFS_DIR=${DEPENDENCE_HOME}/lib
DEVICE_LIB=""
if [[ "$LB_DEVICE_TYPE" == "GPU" ]];then
DEVICE_LIB="/gpu/lib"
export PATH=${DEPENDENCE_HOME}/conda3/envs/gpu/bin:$PATH
else
export PATH=${DEPENDENCE_HOME}/conda3/bin:$PATH
fi
if [[ "$platform" == "Linux" ]];then
export LD_LIBRARY_PATH=${JAVA_HOME}/jre/lib/amd64/server:${DEPENDENCE_HOME}/lib:${LITTLEBOY_HOME}/lib${DEVICE_LIB}:${LD_LIBRARY_PATH+:$LD_LIBRARY_PATH}
else
export DYLD_LIBRARY_PATH=${JAVA_HOME}/jre/lib/amd64/server:${DEPENDENCE_HOME}/lib:${LITTLEBOY_HOME}/lib${DEVICE_LIB}:${LD_LIBRARY_PATH+:$LD_LIBRARY_PATH}
fi
export CUDA_VISIBLE_DEVICES=0
配置LittleBoy计算集群#
根据实际部署情况修改配置文件/usr/local/oushu/littleboy/conf/lbworker-site.xml
<?xml version="1.0" encoding="UTF-8"?>
<configuration>
<property>
<name>basic.master.hosts</name>
<value>server1:1885,server2:1885,server3:1885</value>
<description>LittleBoy Server地址</description>
</property>
<property>
<name>basic.worker.port</name>
<value>1891</value>
<description>LittleBoy Worker REST API端口</description>
</property>
<property>
<name>basic.enablepmmlserver</name>
<value>true</value>
<description>是否启动pmml模型服务</description>
</property>
<property>
<name>basic.logDir</name>
<value>/usr/local/oushu/log/littleboy/</value>
<description>LittleBoy Worker日志文件目录</description>
</property>
<property>
<name>basic.logLevel</name>
<value>info</value>
<description>LittleBoy Worker日志级别</description>
</property>
<property>
<name>basic.storage</name>
<value>/data1/littleboy</value>
<description>LittleBoy Worker临时文件目录</description>
</property>
</configuration>
启动#
启动Nginx服务#
启动或重新加载Nginx服务
#启动nginx
nginx
#重新加载
nginx -s reload
启动LittleBoy基础服务#
登录oushu1节点
ssh oushu1
su - root
执行以下操作以启动LittleBoy基础服务
lava ssh -f ${HOME}/serverhost -e "sudo -u oushu /usr/local/oushu/littleboy/sbin/littleboy start master"
启动LittleBoy计算集群#
执行以下操作以启动LittleBoy计算集群
lava ssh -f ${HOME}/lbhost -e "sudo -u oushu /usr/local/oushu/littleboy/sbin/littleboy start worker"
检查状态#
ps -ef | grep littleboy
常用命令#
# 停止基础服务
lava ssh -f ${HOME}/serverhost -e "sudo -u oushu /usr/local/oushu/littleboy/sbin/littleboy stop master"
# 停止计算集群
lava ssh -f ${HOME}/lbhost -e "sudo -u oushu /usr/local/oushu/littleboy/sbin/littleboy stop worker"
注册到Skylab(可选)#
在oushu1节点修改lava命令行工具配置中skylab的节点ip
vi /usr/local/oushu/lava/conf/server.json
编写注册request到一个文件,例如~/lbworker-register.json(示例中以//开头的内容仅为说明;标准JSON不支持注释,写入实际文件时请删除)
{
"data": {
"name": "LB-Worker",
"group_roles": [
{
"role": "littleboy.worker",
"cluster_name": "lbworker",
"group_name": "worker1",
// 安装的机器信息,需要在lava-admin元数据中
"machines": [
{
"id": 1,
"name": "hostname1",
"subnet": "lava",
"data_ip": "127.0.0.1",
"manage_ip": "",
"assist_port": 1622,
"ssh_port": 22
}
]
}
],
"config": {
"lbworker-site.xml": [
{
"key": "basic.worker.port",
"value": "1891"
},
// 以下是LittleBoy计算集群依赖的Spark集群配置信息
{
"key": "spark.master.rest.port",
"value": "2881"
},
{
"key": "SPARK_MASTER_HOSTS",
"value": "master1,master2" // LittleBoy计算集群依赖的Spark集群master hosts
},
{
"key": "SPARK_HISTORY_UI_PORT",
"value": "2884"
},
{
"key": "SPARK_MASTER_PORT",
"value": "2882"
},
{
"key": "SPARK_MASTER_WEBUI_PORT",
"value": "2883"
},
{
"key": "SPARK_WORKER_WEBUI_PORT",
"value": "2885"
}
]
}
}
}
上述配置文件中,需要根据实际情况修改machines数组中的机器信息,在平台基础组件lava所安装的机器执行:
psql lavaadmin -p 4432 -U oushu -c "select m.id,m.name,s.name as subnet,m.private_ip as data_ip,m.public_ip as manage_ip,m.assist_port,m.ssh_port from machine as m,subnet as s where m.subnet_id=s.id;"
获取到所需的机器信息,根据服务角色对应的节点,将机器信息添加到machines数组中。
例如oushu1对应littleboy worker节点,那么oushu1的机器信息需要被添加到littleboy.worker角色对应的machines数组中。
调用lava命令注册集群:
lava login -u oushu -p ********
lava onprem-register service -s LBWorker -f ~/lbworker-register.json
如果返回值为:
Add service by self success
则表示注册成功,如果有错误信息,请根据错误信息处理。
同时,从页面登录后,在自动部署模块对应服务中可以查看到新添加的集群,列表中会实时监控LittleBoy进程在机器上的状态。