命令行部署#

前提#

LittleBoy前端服务需要依赖Nginx服务

Nginx安装部署请参考:负载均衡 Nginx安装

首先登录到oushu1,然后切换到root用户

ssh oushu1
su - root

创建一个serverhost文件,包含所有部署LittleBoy基础服务(Server)的机器

cat > ${HOME}/serverhost << EOF
oushu1
oushu2
EOF

创建一个lbhost文件,包含LittleBoy集群中所有的机器(Server和Worker节点)

cat > ${HOME}/lbhost << EOF
oushu1
oushu2
oushu3
EOF

在oushu1节点配置yum源,安装lava命令行管理工具

# 从yum源所在机器获取repo文件
scp oushu@192.168.1.10:/etc/yum.repos.d/oushu.repo /etc/yum.repos.d/oushu.repo
# 追加yum源所在机器信息到/etc/hosts文件
# 安装lava命令行管理工具
yum clean all
yum makecache
yum install lava

oushu1节点和集群内其他节点交换公钥,以便ssh免密码登录和分发配置文件。

lava ssh-exkeys -f ${HOME}/lbhost -p ********

分发repo文件到其他机器

lava scp -f ${HOME}/lbhost /etc/yum.repos.d/oushu.repo =:/etc/yum.repos.d

安装#

安装LittleBoy前端服务,只需在oushu1安装即可

yum install littleboy-fe

安装LittleBoy Server/Worker

lava ssh -f ${HOME}/lbhost -e "yum install -y littleboy"

配置#

配置LittleBoy前端服务#

在/usr/local/nginx/conf/premise-frontend.conf配置文件中添加LittleBoy前端的nginx server配置

# littleboy
server {
    listen       1895;
    listen       [::]:1895;
    server_name  _;
    gzip_static on;
    error_page  404              /404.html;
    # redirect server error pages to the static page /50x.html
    error_page   500 502 503 504  /50x.html;
    location = /50x.html {
        root   html;
    }
    # common router
    location / {
        root /usr/local/oushu/littleboy-fe;
        try_files $uri $uri/ /index.html;
        index index.html;
        add_header Cache-Control no-cache;
        add_header Access-Control-Allow-Origin *;
        add_header Access-Control-Allow-Methods 'GET, POST, OPTIONS';
        add_header Access-Control-Allow-Headers 'DNT,X-Mx-ReqToken,Keep-Alive,User-Agent,X-Requested-With,If-Modified-Since,Cache-Control,Content-Type,Authorization';
    }
}

/usr/local/nginx/conf/premise-frontend.conf配置文件中添加转发规则

# 添加websocket转发地址,在nginx配置文件的最外层
upstream websocket-littleboy {
    server 127.0.0.1:1885;
}

# 在lava nginx服务中添加转发规则
server {
    listen       3000;
    server_name  localhost;
        error_page  404              /404.html;
   
    # ...... 
    # 此处省略其他服务配置
    # 以下是需要添加的配置
    # littleboy
    location ~ ^/api/lava/littleboy(.*) {
        proxy_pass http://127.0.0.1:1885/lava/littleboy$1$is_args$args;
        proxy_set_header Host $http_host;
        proxy_set_header X-Real-IP $remote_addr;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
    }
    location ~ ^/main/littleboy/assets(.*) {
        proxy_pass http://127.0.0.1:1895/assets$1$is_args$args;
        proxy_set_header Host $http_host;
        proxy_set_header X-Real-IP $remote_addr;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
    }
    location ~ ^/ws/lava/littleboy/(.*) {
        proxy_pass http://websocket-littleboy/lava/littleboy/$1$is_args$args;
        proxy_read_timeout 100s;
        proxy_send_timeout 100s;
        proxy_set_header Host $host;
        proxy_set_header X-Real-IP $remote_addr;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
        proxy_http_version 1.1;
        proxy_set_header Upgrade $http_upgrade;
        proxy_set_header Connection $connection_upgrade;
    }
    location ~ ^/lava/littleboy/notebook/(.*) {
        proxy_pass http://websocket-littleboy/lava/littleboy/notebook/$1$is_args$args;
        proxy_read_timeout 100s;
        proxy_send_timeout 100s;
        proxy_set_header Host $host;
        proxy_set_header X-Real-IP $remote_addr;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
        proxy_http_version 1.1;
        proxy_set_header Upgrade $http_upgrade;
        proxy_set_header Connection $connection_upgrade;
    }
    # 以上是添加的配置
    # 此处省略其他服务配置
    # ...... 
   
    # redirect server error pages to the static page /50x.html
    error_page   500 502 503 504  /50x.html;
    location = /50x.html {
        root   html;
    }
    # common router
    location / {
        root /usr/local/oushu/premise-ui-common/lava-fe-core/dist;
        try_files $uri $uri/ /index.html;
        index index.html;
        add_header Cache-Control no-cache;
        add_header Access-Control-Allow-Origin *;
        add_header Access-Control-Allow-Methods 'GET, POST, OPTIONS';
        add_header Access-Control-Allow-Headers 'DNT,X-Mx-ReqToken,Keep-Alive,User-Agent,X-Requested-With,If-Modified-Since,Cache-Control,Content-Type,Authorization';
    }
}

配置LittleBoy基础服务#

配置数据库连接,修改配置文件/usr/local/oushu/littleboy/conf/postgres.config.xml

<?xml version="1.0" encoding="utf-8"?>
<!--
~ Copyright (c) 2018. Oushu
-->
<postgres>
    <host>localhost</host>
    <port>4432</port>
    <user>oushu</user>
    <password>*******</password>
    <dbname>littleboy</dbname>
    <sslmode>disable</sslmode>
</postgres>

根据实际部署情况修改配置文件/usr/local/oushu/littleboy/conf/lbserver-site.xml

<?xml version="1.0" encoding="UTF-8"?>
<configuration>
    <property>
        <name>basic.master.hosts</name>
        <value>server1,server2,server3</value>
        <description>LittleBoy所有server的地址(hostname)</description>
    </property>
    <property>
        <name>basic.master.lava.protocol</name>
        <value>https</value>
        <description>Lava REST API服务协议类型</description>
    </property>
    <property>
        <name>basic.master.lava.host</name>
        <value>localhost</value>
        <description>Lava REST API地址(hostname)</description>
    </property>
    <property>
        <name>basic.master.lava.port</name>
        <value>443</value>
        <description>Lava REST API端口</description>
    </property>
    <property>
        <name>basic.master.lava.rpc.port</name>
        <value>8081</value>
        <description>Lava RPC端口</description>
    </property>
    <property>
        <name>basic.master.admin</name>
        <value>https://deployserver:1651</value>
        <description>Lava自动部署REST API地址。可选配置,如果没有配置将通过Lava接口查询自动部署REST API地址,如果配置了,将使用配置的地址</description>
    </property>
    <property>
        <name>basic.master.port</name>
        <value>1885</value>
        <description>LittleBoy Server REST API端口</description>
    </property>
    <property>
        <name>gossip.port</name>
        <value>1888</value>
        <description>LittleBoy Server同步端口</description>
    </property>
    <property>
        <name>gossip.seed</name>
        <value>server1:1888,server2:1888,server3:1888</value>
        <description>LittleBoy Server集群同步成员</description>
    </property>
    <property>
        <name>basic.master.storage</name>
        <value>/littleboy</value>
        <description>LittleBoy Server HDFS存储路径,用于中间文件,模型文件的存储</description>
    </property>
    <property>
        <name>basic.master.hdfsuser</name>
        <value>oushu</value>
        <description>LittleBoy Server HDFS 访问用户</description>
    </property>
    <property>
        <name>basic.logDir</name>
        <value>/usr/local/oushu/log/littleboy</value>
        <description>LittleBoy Server日志文件目录</description>
    </property>
    <property>
        <name>basic.logLevel</name>
        <value>info</value>
        <description>LittleBoy Server日志级别</description>
    </property>
    <property>
        <name>basic.storage</name>
        <value>/data1/littleboy</value>
        <description>LittleBoy Server临时文件目录</description>
    </property>
</configuration>

修改HDFS客户端配置文件/usr/local/oushu/littleboy/conf/hdfs-client.xml

<?xml version="1.0" encoding="UTF-8"?>
<configuration>
    <property>
        <name>rpc.client.timeout</name>
        <value>3600000</value>
    </property>
    <property>
        <name>rpc.client.connect.tcpnodelay</name>
        <value>true</value>
    </property>
    <property>
        <name>rpc.client.max.idle</name>
        <value>10000</value>
    </property>
    <property>
        <name>rpc.client.ping.interval</name>
        <value>10000</value>
    </property>
    <property>
        <name>rpc.client.connect.timeout</name>
        <value>600000</value>
    </property>
    <property>
        <name>rpc.client.connect.retry</name>
        <value>10</value>
    </property>
    <property>
        <name>rpc.client.read.timeout</name>
        <value>3600000</value>
    </property>
    <property>
        <name>rpc.client.write.timeout</name>
        <value>3600000</value>
    </property>
    <property>
        <name>rpc.client.socket.linger.timeout</name>
        <value>-1</value>
    </property>
    <property>
        <name>dfs.client.read.shortcircuit</name>
        <value>true</value>
    </property>
    <property>
        <name>dfs.default.replica</name>
        <value>3</value>
    </property>
    <property>
        <name>dfs.prefetchsize</name>
        <value>10</value>
    </property>
    <property>
        <name>dfs.client.failover.max.attempts</name>
        <value>15</value>
    </property>
    <property>
        <name>dfs.default.blocksize</name>
        <value>134217728</value>
    </property>
    <property>
        <name>dfs.client.log.severity</name>
        <value>INFO</value>
    </property>
    <property>
        <name>input.connect.timeout</name>
        <value>600000</value>
    </property>
    <property>
        <name>input.read.timeout</name>
        <value>3600000</value>
    </property>
    <property>
        <name>input.write.timeout</name>
        <value>3600000</value>
    </property>
    <property>
        <name>input.localread.default.buffersize</name>
        <value>2097152</value>
    </property>
    <property>
        <name>input.localread.blockinfo.cachesize</name>
        <value>1000</value>
    </property>
    <property>
        <name>input.read.getblockinfo.retry</name>
        <value>3</value>
    </property>
    <property>
        <name>output.replace-datanode-on-failure</name>
        <value>false</value>
    </property>
    <property>
        <name>output.default.chunksize</name>
        <value>512</value>
    </property>
    <property>
        <name>output.default.packetsize</name>
        <value>65536</value>
    </property>
    <property>
        <name>output.default.write.retry</name>
        <value>10</value>
    </property>
    <property>
        <name>output.connect.timeout</name>
        <value>600000</value>
    </property>
    <property>
        <name>output.read.timeout</name>
        <value>3600000</value>
    </property>
    <property>
        <name>output.write.timeout</name>
        <value>3600000</value>
    </property>
    <property>
        <name>output.packetpool.size</name>
        <value>1024</value>
    </property>
    <property>
        <name>output.close.timeout</name>
        <value>900000</value>
    </property>
    <property>
        <name>dfs.domain.socket.path</name>
        <value>/var/lib/hadoop-hdfs/dn_socket</value>
    </property>
    <property>
        <name>dfs.client.use.legacy.blockreader.local</name>
        <value>false</value>
    </property>
    <property>
        <name>dfs.ha.namenodes.oushu</name>
        <value>nn1,nn2</value>
    </property>
    <property>
        <name>dfs.namenode.http-address.oushu.nn1</name>
        <value>namenode1:50070</value>
    </property>
    <property>
        <name>dfs.namenode.http-address.oushu.nn2</name>
        <value>namenode2:50070</value>
    </property>
    <property>
        <name>dfs.namenode.rpc-address.oushu.nn1</name>
        <value>namenode1:9000</value>
    </property>
    <property>
        <name>dfs.namenode.rpc-address.oushu.nn2</name>
        <value>namenode2:9000</value>
    </property>
    <property>
        <name>dfs.nameservices</name>
        <value>oushu</value>
    </property>
</configuration>

在/usr/local/oushu/littleboy/conf/littleboy-env.sh文件中修改环境变量JAVA_HOME

#!/usr/bin/env bash
this="${BASH_SOURCE-$0}"
export LITTLEBOY_HOME=$(cd -- "$(dirname -- "$this")/.." && pwd -P)
export DEPENDENCE_HOME=/usr/local/oushu/littleboy-dependence
platform=`uname`
# might need config manually START
export LB_DEVICE_TYPE=CPU   # need to restart the process for predict, but will come into force immediately in the next train. # Should check multi-device cluster availability.
export JAVA_HOME=${JAVA_HOME}
# config for hdfs client
export LIBHDFS3_CONF=${LITTLEBOY_HOME}/conf/hdfs-client.xml
# might need config manually END
export JARS_DIR=${LITTLEBOY_HOME}/jars
export PYTHON_BIN=${DEPENDENCE_HOME}/conda3/bin/python3
export LIBHDFS_PATH=${DEPENDENCE_HOME}/lib/libhdfs.so
export ARROW_LIBHDFS_DIR=${DEPENDENCE_HOME}/lib
DEVICE_LIB=""
if [[ "$LB_DEVICE_TYPE" == "GPU" ]];then
    DEVICE_LIB="/gpu/lib"
    export PATH=${DEPENDENCE_HOME}/conda3/envs/gpu/bin:$PATH
else
    export PATH=${DEPENDENCE_HOME}/conda3/bin:$PATH
fi
if [[ "$platform" == "Linux" ]];then
    export LD_LIBRARY_PATH=${JAVA_HOME}/jre/lib/amd64/server:${DEPENDENCE_HOME}/lib:${LITTLEBOY_HOME}/lib${DEVICE_LIB}:${LD_LIBRARY_PATH+:$LD_LIBRARY_PATH}
else
    export DYLD_LIBRARY_PATH=${JAVA_HOME}/jre/lib/amd64/server:${DEPENDENCE_HOME}/lib:${LITTLEBOY_HOME}/lib${DEVICE_LIB}:${LD_LIBRARY_PATH+:$LD_LIBRARY_PATH}
fi
export CUDA_VISIBLE_DEVICES=0

配置LittleBoy计算集群#

根据实际部署情况修改配置文件/usr/local/oushu/littleboy/conf/lbworker-site.xml

<?xml version="1.0" encoding="UTF-8"?>
<configuration>
    <property>
        <name>basic.master.hosts</name>
        <value>server1:1885,server2:1885,server3:1885</value>
        <description>LittleBoy Server地址</description>
    </property>
    <property>
        <name>basic.worker.port</name>
        <value>1891</value>
        <description>LittleBoy Worker REST API端口</description>
    </property>
    <property>
        <name>basic.enablepmmlserver</name>
        <value>true</value>
        <description>是否启动pmml模型服务</description>
    </property>
    <property>
        <name>basic.logDir</name>
        <value>/usr/local/oushu/log/littleboy/</value>
        <description>LittleBoy Worker日志文件目录</description>
    </property>
    <property>
        <name>basic.logLevel</name>
        <value>info</value>
        <description>LittleBoy Worker日志级别</description>
    </property>
    <property>
        <name>basic.storage</name>
        <value>/data1/littleboy</value>
        <description>LittleBoy Worker临时文件目录</description>
    </property>
</configuration>

启动#

启动Nginx服务#

启动或重新加载Nginx服务

#启动nginx
nginx
#重新加载
nginx -s reload

启动LittleBoy基础服务#

登录oushu1节点

ssh oushu1
su - root

执行以下操作以启动LittleBoy基础服务

lava ssh -f ${HOME}/serverhost -e "sudo -u oushu /usr/local/oushu/littleboy/sbin/littleboy start master" 

启动LittleBoy计算集群#

执行以下操作以启动LittleBoy计算集群

lava ssh -f ${HOME}/lbhost -e "sudo -u oushu /usr/local/oushu/littleboy/sbin/littleboy start worker" 

检查状态#

ps -ef | grep littleboy

常用命令#

# 停止基础服务
lava ssh -f ${HOME}/serverhost -e "sudo -u oushu /usr/local/oushu/littleboy/sbin/littleboy stop master" 
# 停止计算集群
lava ssh -f ${HOME}/lbhost -e "sudo -u oushu /usr/local/oushu/littleboy/sbin/littleboy stop worker" 

注册到Skylab(可选)#

在oushu1节点修改lava命令行工具配置中skylab的节点ip

vi /usr/local/oushu/lava/conf/server.json

编写注册request到一个文件,例如~/lbworker-register.json(注意:标准JSON不支持注释,示例中以//开头的说明文字仅供阅读,写入文件时请删除)

{
    "data": {
        "name": "LB-Worker",
        "group_roles": [
            {
                "role": "littleboy.worker",
                "cluster_name": "lbworker",
                "group_name": "worker1",
                // 安装的机器信息,需要在lava-admin元数据中
                "machines": [
                    {
                        "id": 1,
                        "name": "hostname1",
                        "subnet": "lava",
                        "data_ip": "127.0.0.1",
                        "manage_ip": "",
                        "assist_port": 1622,
                        "ssh_port": 22
                    }
                ]
            }
        ],
        "config": {
            "lbworker-site.xml": [
                {
                    "key": "basic.worker.port",
                    "value": "1891"
                },
                // 以下是LittleBoy计算集群依赖的Spark集群配置信息
                {
                    "key": "spark.master.rest.port",
                    "value": "2881"
                },
                {
                    "key": "SPARK_MASTER_HOSTS",
                    "value": "master1,master2" // LittleBoy计算集群依赖的Spark集群master hosts
                },
                {
                    "key": "SPARK_HISTORY_UI_PORT",
                    "value": "2884"
                },
                {
                    "key": "SPARK_MASTER_PORT",
                    "value": "2882"
                },
                {
                    "key": "SPARK_MASTER_WEBUI_PORT",
                    "value": "2883"
                },
                {
                    "key": "SPARK_WORKER_WEBUI_PORT",
                    "value": "2885"
                }
            ]
        }
    }
}

上述配置文件中,需要根据实际情况修改machines数组中的机器信息,在平台基础组件lava所安装的机器执行:

psql lavaadmin -p 4432 -U oushu -c "select m.id,m.name,s.name as subnet,m.private_ip as data_ip,m.public_ip as manage_ip,m.assist_port,m.ssh_port from machine as m,subnet as s where m.subnet_id=s.id;"

获取到所需的机器信息,根据服务角色对应的节点,将机器信息添加到machines数组中。

例如oushu1对应littleboy worker节点,那么oushu1的机器信息需要被添加到littleboy.worker角色对应的machines数组中。

调用lava命令注册集群:

lava login -u oushu -p ********
lava onprem-register service -s LBWorker -f ~/lbworker-register.json

如果返回值为:

Add service by self success

则表示注册成功,如果有错误信息,请根据错误信息处理。

此外,从页面登录后,在自动部署模块对应服务中可以查看到新添加的集群,列表中会实时监控LittleBoy进程在机器上的状态。