WangHan
2024-09-12 d5855a4926926698b740bc6c7ba489de47adb68b
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
package tech.powerjob.server.remote.worker;
 
import tech.powerjob.common.model.DeployedContainerInfo;
import tech.powerjob.common.request.WorkerHeartbeat;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import lombok.extern.slf4j.Slf4j;
import org.springframework.util.CollectionUtils;
import tech.powerjob.server.common.module.WorkerInfo;
 
import java.util.Collections;
import java.util.List;
import java.util.Map;
 
/**
 * 管理Worker集群状态
 *
 * @author tjq
 * @since 2020/4/5
 */
@Slf4j
public class ClusterStatusHolder {
 
    /**
     * 集群所属的应用名称
     */
    private final String appName;
    /**
     * 集群中所有机器的信息
     */
    private final Map<String, WorkerInfo> address2WorkerInfo;
    /**
     * 集群中所有机器的容器部署状态 containerId -> (workerAddress -> containerInfo)
     */
    private Map<Long, Map<String, DeployedContainerInfo>> containerId2Infos;
 
 
    public ClusterStatusHolder(String appName) {
        this.appName = appName;
        address2WorkerInfo = Maps.newConcurrentMap();
        containerId2Infos = Maps.newConcurrentMap();
    }
 
    /**
     * 更新 worker 机器的状态
     * @param heartbeat 心跳请求
     */
    public void updateStatus(WorkerHeartbeat heartbeat) {
 
        String workerAddress = heartbeat.getWorkerAddress();
        long heartbeatTime = heartbeat.getHeartbeatTime();
 
        WorkerInfo workerInfo = address2WorkerInfo.computeIfAbsent(workerAddress, ignore -> {
            WorkerInfo wf = new WorkerInfo();
            wf.refresh(heartbeat);
            return wf;
        });
        long oldTime = workerInfo.getLastActiveTime();
        if (heartbeatTime < oldTime) {
            log.warn("[ClusterStatusHolder-{}] receive the expired heartbeat from {}, serverTime: {}, heartTime: {}", appName, heartbeat.getWorkerAddress(), System.currentTimeMillis(), heartbeat.getHeartbeatTime());
            return;
        }
 
        workerInfo.refresh(heartbeat);
 
        List<DeployedContainerInfo> containerInfos = heartbeat.getContainerInfos();
        if (!CollectionUtils.isEmpty(containerInfos)) {
            containerInfos.forEach(containerInfo -> {
                Map<String, DeployedContainerInfo> infos = containerId2Infos.computeIfAbsent(containerInfo.getContainerId(), ignore -> Maps.newConcurrentMap());
                infos.put(workerAddress, containerInfo);
            });
        }
    }
 
    /**
     * 获取该集群所有的机器信息
     * @return 地址: 机器信息
     */
    public Map<String, WorkerInfo> getAllWorkers() {
        return address2WorkerInfo;
    }
 
    /**
     * 获取当前该Worker集群容器的部署情况
     * @param containerId 容器ID
     * @return 该容器的部署情况
     */
    public List<DeployedContainerInfo> getDeployedContainerInfos(Long containerId) {
        List<DeployedContainerInfo> res = Lists.newLinkedList();
        containerId2Infos.getOrDefault(containerId, Collections.emptyMap()).forEach((address, info) -> {
            info.setWorkerAddress(address);
            res.add(info);
        });
        return res;
    }
 
    /**
     * 释放所有本地存储的容器信息(该操作会导致短暂的 listDeployedContainer 服务不可用)
     */
    public void release() {
        log.info("[ClusterStatusHolder-{}] clean the containerInfos, listDeployedContainer service may down about 1min~", appName);
        // 丢弃原来的所有数据,准备重建
        containerId2Infos = Maps.newConcurrentMap();
 
        // 丢弃超时机器的信息
        List<String> timeoutAddress = Lists.newLinkedList();
        address2WorkerInfo.forEach((addr, workerInfo) -> {
            if (workerInfo.timeout()) {
                timeoutAddress.add(addr);
            }
        });
 
        if (!timeoutAddress.isEmpty()) {
            log.info("[ClusterStatusHolder-{}] detective timeout workers({}), try to release their infos.", appName, timeoutAddress);
            timeoutAddress.forEach(address2WorkerInfo::remove);
        }
    }
}