Skip to content

Commit 3b108b9

Browse files
Support for Management Server Maintenance Mode (#9854)
* Support for Management Server Maintenance - New APIs: prepareForMaintenance and cancelMaintenance, with required parameter - managementserverid. - New management server states for maintenance: PreparingForMaintenance, Maintenance. - listHosts API with optional parameter – managementserverid, to list the hosts connected to the management server. - Support management server maintenance when more than one active management server is available. - Triggers transfer of agents to other available management servers for maintenance, new agent command MigrateAgentConnectionCommand to initiate transfer of indirect agents. - New global config 'management.server.maintenance.timeout', to set the timeout (in mins) for the management server maintenance window, default: 60 mins. - UI changes: Prepare and Cancel Maintenance in Management Server section, Connected Agents tab, New fields for hosts and management servers. * Updated pending jobs check timer task with ScheduledExecutorService * keep maintenance state on trigger shutdown call when ms is in maintenance * add pending jobs count to ms response * during ms heartbeat, update state to up only when it's down * allow vm work jobs of async job created before prepare for maintenance * Revert "keep maintenance state on trigger shutdown call when ms is in maintenance" This reverts commit 607e133. * skip maintenance test when multiple management servers are not available, and not configured in host setting for kvm
1 parent 048649d commit 3b108b9

File tree

105 files changed

+2672
-713
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

105 files changed

+2672
-713
lines changed

agent/src/main/java/com/cloud/agent/Agent.java

+84-14
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
import java.nio.channels.ClosedChannelException;
2828
import java.nio.charset.Charset;
2929
import java.util.ArrayList;
30+
import java.util.Arrays;
3031
import java.util.HashMap;
3132
import java.util.List;
3233
import java.util.Map;
@@ -40,6 +41,8 @@
4041

4142
import javax.naming.ConfigurationException;
4243

44+
import com.cloud.agent.api.MigrateAgentConnectionAnswer;
45+
import com.cloud.agent.api.MigrateAgentConnectionCommand;
4346
import com.cloud.resource.AgentStatusUpdater;
4447
import com.cloud.resource.ResourceStatusUpdater;
4548
import com.cloud.agent.api.PingAnswer;
@@ -313,7 +316,6 @@ public void start() {
313316
}
314317
_shell.updateConnectedHost();
315318
scavengeOldAgentObjects();
316-
317319
}
318320

319321
public void stop(final String reason, final String detail) {
@@ -477,13 +479,18 @@ public synchronized void lockStartupTask(final Link link) {
477479
}
478480

479481
public void sendStartup(final Link link) {
482+
sendStartup(link, false);
483+
}
484+
485+
public void sendStartup(final Link link, boolean transfer) {
480486
final StartupCommand[] startup = _resource.initialize();
481487
if (startup != null) {
482488
final String msHostList = _shell.getPersistentProperty(null, "host");
483489
final Command[] commands = new Command[startup.length];
484490
for (int i = 0; i < startup.length; i++) {
485491
setupStartupCommand(startup[i]);
486492
startup[i].setMSHostList(msHostList);
493+
startup[i].setConnectionTransferred(transfer);
487494
commands[i] = startup[i];
488495
}
489496
final Request request = new Request(_id != null ? _id : -1, -1, commands, false, false);
@@ -541,9 +548,14 @@ public Task create(final Task.Type type, final Link link, final byte[] data) {
541548
}
542549

543550
protected void reconnect(final Link link) {
544-
if (!_reconnectAllowed) {
551+
reconnect(link, null, null, false);
552+
}
553+
554+
protected void reconnect(final Link link, String preferredHost, List<String> avoidHostList, boolean forTransfer) {
555+
if (!(forTransfer || _reconnectAllowed)) {
545556
return;
546557
}
558+
547559
synchronized (this) {
548560
if (_startup != null) {
549561
_startup.cancel();
@@ -575,22 +587,29 @@ protected void reconnect(final Link link) {
575587
_shell.getBackoffAlgorithm().waitBeforeRetry();
576588
}
577589

590+
String host = preferredHost;
591+
if (StringUtils.isEmpty(host)) {
592+
host = _shell.getNextHost();
593+
}
594+
578595
do {
579-
final String host = _shell.getNextHost();
580-
_connection = new NioClient("Agent", host, _shell.getPort(), _shell.getWorkers(), this);
581-
logger.info("Reconnecting to host:{}", host);
582-
try {
583-
_connection.start();
584-
} catch (final NioConnectionException e) {
585-
logger.info("Attempted to re-connect to the server, but received an unexpected exception, trying again...", e);
586-
_connection.stop();
596+
if (CollectionUtils.isEmpty(avoidHostList) || !avoidHostList.contains(host)) {
597+
_connection = new NioClient("Agent", host, _shell.getPort(), _shell.getWorkers(), this);
598+
logger.info("Reconnecting to host:{}", host);
587599
try {
588-
_connection.cleanUp();
589-
} catch (final IOException ex) {
590-
logger.warn("Fail to clean up old connection. {}", ex);
600+
_connection.start();
601+
} catch (final NioConnectionException e) {
602+
logger.info("Attempted to re-connect to the server, but received an unexpected exception, trying again...", e);
603+
_connection.stop();
604+
try {
605+
_connection.cleanUp();
606+
} catch (final IOException ex) {
607+
logger.warn("Fail to clean up old connection. {}", ex);
608+
}
591609
}
592610
}
593611
_shell.getBackoffAlgorithm().waitBeforeRetry();
612+
host = _shell.getNextHost();
594613
} while (!_connection.isStartup());
595614
_shell.updateConnectedHost();
596615
logger.info("Connected to the host: {}", _shell.getConnectedHost());
@@ -703,6 +722,8 @@ protected void processRequest(final Request request, final Link link) {
703722
}
704723
} else if (cmd instanceof SetupMSListCommand) {
705724
answer = setupManagementServerList((SetupMSListCommand) cmd);
725+
} else if (cmd instanceof MigrateAgentConnectionCommand) {
726+
answer = migrateAgentToOtherMS((MigrateAgentConnectionCommand) cmd);
706727
} else {
707728
if (cmd instanceof ReadyCommand) {
708729
processReadyCommand(cmd);
@@ -858,6 +879,53 @@ private Answer setupManagementServerList(final SetupMSListCommand cmd) {
858879
return new SetupMSListAnswer(true);
859880
}
860881

882+
private Answer migrateAgentToOtherMS(final MigrateAgentConnectionCommand cmd) {
883+
try {
884+
if (CollectionUtils.isNotEmpty(cmd.getMsList())) {
885+
processManagementServerList(cmd.getMsList(), cmd.getLbAlgorithm(), cmd.getLbCheckInterval());
886+
}
887+
migrateAgentConnection(cmd.getAvoidMsList());
888+
} catch (Exception e) {
889+
String errMsg = "Migrate agent connection failed, due to " + e.getMessage();
890+
logger.debug(errMsg, e);
891+
return new MigrateAgentConnectionAnswer(errMsg);
892+
}
893+
return new MigrateAgentConnectionAnswer(true);
894+
}
895+
896+
private void migrateAgentConnection(List<String> avoidMsList) {
897+
final String[] msHosts = _shell.getHosts();
898+
if (msHosts == null || msHosts.length < 1) {
899+
throw new CloudRuntimeException("Management Server hosts empty, not properly configured in agent");
900+
}
901+
902+
List<String> msHostsList = new ArrayList<>(Arrays.asList(msHosts));
903+
msHostsList.removeAll(avoidMsList);
904+
if (msHostsList.isEmpty() || StringUtils.isEmpty(msHostsList.get(0))) {
905+
throw new CloudRuntimeException("No other Management Server hosts to migrate");
906+
}
907+
908+
String preferredHost = null;
909+
for (String msHost : msHostsList) {
910+
try (final Socket socket = new Socket()) {
911+
socket.connect(new InetSocketAddress(msHost, _shell.getPort()), 5000);
912+
preferredHost = msHost;
913+
break;
914+
} catch (final IOException e) {
915+
throw new CloudRuntimeException("Management server host: " + msHost + " is not reachable, to migrate connection");
916+
}
917+
}
918+
919+
if (preferredHost == null) {
920+
throw new CloudRuntimeException("Management server host(s) are not reachable, to migrate connection");
921+
}
922+
923+
logger.debug("Management server host " + preferredHost + " is found to be reachable, trying to reconnect");
924+
_shell.resetHostCounter();
925+
_shell.setConnectionTransfer(true);
926+
reconnect(_link, preferredHost, avoidMsList, true);
927+
}
928+
861929
public void processResponse(final Response response, final Link link) {
862930
final Answer answer = response.getAnswer();
863931
logger.debug("Received response: {}", response.toString());
@@ -1153,7 +1221,8 @@ public void doTask(final Task task) throws TaskExecutionException {
11531221
if (task.getType() == Task.Type.CONNECT) {
11541222
_shell.getBackoffAlgorithm().reset();
11551223
setLink(task.getLink());
1156-
sendStartup(task.getLink());
1224+
sendStartup(task.getLink(), _shell.isConnectionTransfer());
1225+
_shell.setConnectionTransfer(false);
11571226
} else if (task.getType() == Task.Type.DATA) {
11581227
Request request;
11591228
try {
@@ -1178,6 +1247,7 @@ public void doTask(final Task task) throws TaskExecutionException {
11781247
Thread.sleep(5000);
11791248
} catch (InterruptedException e) {
11801249
}
1250+
_shell.setConnectionTransfer(false);
11811251
reconnect(task.getLink());
11821252
return;
11831253
} else if (task.getType() == Task.Type.OTHER) {

agent/src/main/java/com/cloud/agent/AgentShell.java

+9
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,7 @@ public class AgentShell implements IAgentShell, Daemon {
7777
private String hostToConnect;
7878
private String connectedHost;
7979
private Long preferredHostCheckInterval;
80+
private boolean connectionTransfer = false;
8081
protected AgentProperties agentProperties = new AgentProperties();
8182

8283
public AgentShell() {
@@ -215,6 +216,14 @@ public void setPersistentProperty(String prefix, String name, String value) {
215216
_storage.persist(name, value);
216217
}
217218

219+
public boolean isConnectionTransfer() {
220+
return connectionTransfer;
221+
}
222+
223+
public void setConnectionTransfer(boolean connectionTransfer) {
224+
this.connectionTransfer = connectionTransfer;
225+
}
226+
218227
void loadProperties() throws ConfigurationException {
219228
final File file = PropertiesUtil.findConfigFile("agent.properties");
220229

agent/src/main/java/com/cloud/agent/IAgentShell.java

+4
Original file line numberDiff line numberDiff line change
@@ -70,4 +70,8 @@ public interface IAgentShell {
7070
String getConnectedHost();
7171

7272
void launchNewAgent(ServerResource resource) throws ConfigurationException;
73+
74+
boolean isConnectionTransfer();
75+
76+
void setConnectionTransfer(boolean connectionTransfer);
7377
}

api/src/main/java/com/cloud/host/Host.java

+2
Original file line numberDiff line numberDiff line change
@@ -177,6 +177,8 @@ public static String[] toStrings(Host.Type... types) {
177177
*/
178178
Long getManagementServerId();
179179

180+
Long getLastManagementServerId();
181+
180182
/*
181183
*@return removal date
182184
*/

api/src/main/java/com/cloud/host/Status.java

+1
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,7 @@ public static String[] toStrings(Status... states) {
127127
s_fsm.addTransition(Status.Connecting, Event.HostDown, Status.Down);
128128
s_fsm.addTransition(Status.Connecting, Event.Ping, Status.Connecting);
129129
s_fsm.addTransition(Status.Connecting, Event.ManagementServerDown, Status.Disconnected);
130+
s_fsm.addTransition(Status.Connecting, Event.StartAgentRebalance, Status.Rebalancing);
130131
s_fsm.addTransition(Status.Connecting, Event.AgentDisconnected, Status.Alert);
131132
s_fsm.addTransition(Status.Up, Event.PingTimeout, Status.Alert);
132133
s_fsm.addTransition(Status.Up, Event.AgentDisconnected, Status.Alert);

api/src/main/java/com/cloud/resource/ResourceService.java

+4-4
Original file line numberDiff line numberDiff line change
@@ -23,11 +23,11 @@
2323
import org.apache.cloudstack.api.command.admin.cluster.UpdateClusterCmd;
2424
import org.apache.cloudstack.api.command.admin.host.AddHostCmd;
2525
import org.apache.cloudstack.api.command.admin.host.AddSecondaryStorageCmd;
26-
import org.apache.cloudstack.api.command.admin.host.CancelMaintenanceCmd;
26+
import org.apache.cloudstack.api.command.admin.host.CancelHostMaintenanceCmd;
2727
import org.apache.cloudstack.api.command.admin.host.ReconnectHostCmd;
2828
import org.apache.cloudstack.api.command.admin.host.UpdateHostCmd;
2929
import org.apache.cloudstack.api.command.admin.host.UpdateHostPasswordCmd;
30-
import org.apache.cloudstack.api.command.admin.host.PrepareForMaintenanceCmd;
30+
import org.apache.cloudstack.api.command.admin.host.PrepareForHostMaintenanceCmd;
3131
import org.apache.cloudstack.api.command.admin.host.DeclareHostAsDegradedCmd;
3232
import org.apache.cloudstack.api.command.admin.host.CancelHostAsDegradedCmd;
3333

@@ -51,7 +51,7 @@ public interface ResourceService {
5151

5252
Host autoUpdateHostAllocationState(Long hostId, ResourceState.Event resourceEvent) throws NoTransitionException;
5353

54-
Host cancelMaintenance(CancelMaintenanceCmd cmd);
54+
Host cancelMaintenance(CancelHostMaintenanceCmd cmd);
5555

5656
Host reconnectHost(ReconnectHostCmd cmd) throws AgentUnavailableException;
5757

@@ -69,7 +69,7 @@ public interface ResourceService {
6969

7070
List<? extends Host> discoverHosts(AddSecondaryStorageCmd cmd) throws IllegalArgumentException, DiscoveryException, InvalidParameterValueException;
7171

72-
Host maintain(PrepareForMaintenanceCmd cmd);
72+
Host maintain(PrepareForHostMaintenanceCmd cmd);
7373

7474
Host declareHostAsDegraded(DeclareHostAsDegradedCmd cmd) throws NoTransitionException;
7575

api/src/main/java/com/cloud/server/ManagementServerHostStats.java

+5
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
package com.cloud.server;
2020

2121
import java.util.Date;
22+
import java.util.List;
2223

2324
/**
2425
* management server related stats
@@ -70,6 +71,10 @@ public interface ManagementServerHostStats {
7071

7172
String getOsDistribution();
7273

74+
List<String> getLastAgents();
75+
76+
List<String> getAgents();
77+
7378
int getAgentCount();
7479

7580
long getHeapMemoryUsed();

api/src/main/java/org/apache/cloudstack/api/ApiConstants.java

+4-1
Original file line numberDiff line numberDiff line change
@@ -1136,9 +1136,12 @@ public class ApiConstants {
11361136
public static final String LOGOUT = "logout";
11371137
public static final String LIST_IDPS = "listIdps";
11381138

1139-
public static final String READY_FOR_SHUTDOWN = "readyforshutdown";
1139+
public static final String MAINTENANCE_INITIATED = "maintenanceinitiated";
11401140
public static final String SHUTDOWN_TRIGGERED = "shutdowntriggered";
1141+
public static final String READY_FOR_SHUTDOWN = "readyforshutdown";
11411142
public static final String PENDING_JOBS_COUNT = "pendingjobscount";
1143+
public static final String AGENTS_COUNT = "agentscount";
1144+
public static final String AGENTS = "agents";
11421145

11431146
public static final String PUBLIC_MTU = "publicmtu";
11441147
public static final String PRIVATE_MTU = "privatemtu";

api/src/main/java/org/apache/cloudstack/api/command/admin/host/CancelMaintenanceCmd.java api/src/main/java/org/apache/cloudstack/api/command/admin/host/CancelHostMaintenanceCmd.java

+1-1
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@
3333

3434
@APICommand(name = "cancelHostMaintenance", description = "Cancels host maintenance.", responseObject = HostResponse.class,
3535
requestHasSensitiveInfo = false, responseHasSensitiveInfo = false)
36-
public class CancelMaintenanceCmd extends BaseAsyncCmd {
36+
public class CancelHostMaintenanceCmd extends BaseAsyncCmd {
3737

3838

3939
/////////////////////////////////////////////////////

api/src/main/java/org/apache/cloudstack/api/command/admin/host/ListHostsCmd.java

+8
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
import org.apache.cloudstack.api.response.ClusterResponse;
3232
import org.apache.cloudstack.api.response.HostResponse;
3333
import org.apache.cloudstack.api.response.ListResponse;
34+
import org.apache.cloudstack.api.response.ManagementServerResponse;
3435
import org.apache.cloudstack.api.response.PodResponse;
3536
import org.apache.cloudstack.api.response.UserVmResponse;
3637
import org.apache.cloudstack.api.response.ZoneResponse;
@@ -105,6 +106,9 @@ public class ListHostsCmd extends BaseListCmd {
105106
@Parameter(name = ApiConstants.HYPERVISOR, type = CommandType.STRING, description = "hypervisor type of host: XenServer,KVM,VMware,Hyperv,BareMetal,Simulator")
106107
private String hypervisor;
107108

109+
@Parameter(name = ApiConstants.MANAGEMENT_SERVER_ID, type = CommandType.UUID, entityType = ManagementServerResponse.class, description = "the id of the management server", since="4.21.0")
110+
private Long managementServerId;
111+
108112
/////////////////////////////////////////////////////
109113
/////////////////// Accessors ///////////////////////
110114
/////////////////////////////////////////////////////
@@ -189,6 +193,10 @@ public String getHostOutOfBandManagementPowerState() {
189193
return outOfBandManagementPowerState;
190194
}
191195

196+
public Long getManagementServerId() {
197+
return managementServerId;
198+
}
199+
192200
/////////////////////////////////////////////////////
193201
/////////////// API Implementation///////////////////
194202
/////////////////////////////////////////////////////

api/src/main/java/org/apache/cloudstack/api/command/admin/host/PrepareForMaintenanceCmd.java api/src/main/java/org/apache/cloudstack/api/command/admin/host/PrepareForHostMaintenanceCmd.java

+1-1
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@
3333

3434
@APICommand(name = "prepareHostForMaintenance", description = "Prepares a host for maintenance.", responseObject = HostResponse.class,
3535
requestHasSensitiveInfo = false, responseHasSensitiveInfo = false)
36-
public class PrepareForMaintenanceCmd extends BaseAsyncCmd {
36+
public class PrepareForHostMaintenanceCmd extends BaseAsyncCmd {
3737

3838

3939
/////////////////////////////////////////////////////

api/src/main/java/org/apache/cloudstack/api/response/AsyncJobResponse.java

+12-4
Original file line numberDiff line numberDiff line change
@@ -83,9 +83,13 @@ public class AsyncJobResponse extends BaseResponse {
8383
@Param(description = "the unique ID of the instance/entity object related to the job")
8484
private String jobInstanceId;
8585

86-
@SerializedName("managementserverid")
86+
@SerializedName(ApiConstants.MANAGEMENT_SERVER_ID)
8787
@Param(description = "the msid of the management server on which the job is running", since = "4.19")
88-
private Long msid;
88+
private String managementServerId;
89+
90+
@SerializedName(ApiConstants.MANAGEMENT_SERVER_NAME)
91+
@Param(description = "the management server name of the host", since = "4.21.0")
92+
private String managementServerName;
8993

9094
@SerializedName(ApiConstants.CREATED)
9195
@Param(description = " the created date of the job")
@@ -156,7 +160,11 @@ public void setRemoved(final Date removed) {
156160
this.removed = removed;
157161
}
158162

159-
public void setMsid(Long msid) {
160-
this.msid = msid;
163+
public void setManagementServerId(String managementServerId) {
164+
this.managementServerId = managementServerId;
165+
}
166+
167+
public void setManagementServerName(String managementServerName) {
168+
this.managementServerName = managementServerName;
161169
}
162170
}

0 commit comments

Comments
 (0)