Skip to content

Commit 2daffa3

Browse files
committed
Merge release branch 4.20 to main
* 4.20:
  VR: fix site-2-site VPN if split connections is enabled (#10067)
  UI: fix cannot open 'Edit tags' modal for static routes (#10065)
  Update ownership selection component to be language independent (#10052)
  Support to enable/disable VM High Availability manager and related alerts (#10118)
2 parents b48de4e + 41c27e1 commit 2daffa3

File tree

12 files changed

+297
-37
lines changed

12 files changed

+297
-37
lines changed

engine/components-api/src/main/java/com/cloud/ha/HighAvailabilityManager.java

+3-3
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@
3232
*/
3333
public interface HighAvailabilityManager extends Manager {
3434

35-
public ConfigKey<Boolean> ForceHA = new ConfigKey<>("Advanced", Boolean.class, "force.ha", "false",
35+
ConfigKey<Boolean> ForceHA = new ConfigKey<>("Advanced", Boolean.class, "force.ha", "false",
3636
"Force High-Availability to happen even if the VM says no.", true, Cluster);
3737

3838
ConfigKey<Integer> HAWorkers = new ConfigKey<>("Advanced", Integer.class, "ha.workers", "5",
@@ -112,7 +112,7 @@ enum Step {
112112

113113
void cancelDestroy(VMInstanceVO vm, Long hostId);
114114

115-
void scheduleDestroy(VMInstanceVO vm, long hostId);
115+
boolean scheduleDestroy(VMInstanceVO vm, long hostId);
116116

117117
/**
118118
* Schedule restarts for all vms running on the host.
@@ -143,7 +143,7 @@ enum Step {
143143
* @param host host the virtual machine is on.
144144
* @param type which type of stop is requested.
145145
*/
146-
void scheduleStop(VMInstanceVO vm, long hostId, WorkType type);
146+
boolean scheduleStop(VMInstanceVO vm, long hostId, WorkType type);
147147

148148
void cancelScheduledMigrations(HostVO host);
149149

server/src/main/java/com/cloud/ha/HighAvailabilityManagerImpl.java

+115-14
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@
1616
// under the License.
1717
package com.cloud.ha;
1818

19+
import static org.apache.cloudstack.framework.config.ConfigKey.Scope.Zone;
20+
1921
import java.util.ArrayList;
2022
import java.util.Date;
2123
import java.util.HashMap;
@@ -121,6 +123,16 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements Configur
121123
"Total number of attempts for trying migration of a VM.",
122124
true, ConfigKey.Scope.Global);
123125

126+
public static ConfigKey<Boolean> VmHaEnabled = new ConfigKey<>("Advanced", Boolean.class, "vm.ha.enabled", "true",
127+
"Enable/Disable VM High Availability manager, it is enabled by default."
128+
+ " When enabled, the VM HA WorkItems (for VM Stop, Restart, Migration, Destroy) can be created and the scheduled items are executed; and"
129+
+ " When disabled, new VM HA WorkItems are not allowed and the scheduled items are retried until max retries configured at 'vm.ha.migration.max.retries'"
130+
+ " (executed in case HA is re-enabled during retry attempts), and then purged after 'time.between.failures' by the cleanup thread that runs"
131+
+ " regularly at 'time.between.cleanup'", true, Zone);
132+
133+
protected static ConfigKey<Boolean> VmHaAlertsEnabled = new ConfigKey<>("Advanced", Boolean.class, "vm.ha.alerts.enabled", "true",
134+
"Enable/Disable alerts for the VM HA operations, it is enabled by default.", true, Zone);
135+
124136
WorkerThread[] _workers;
125137
boolean _stopped;
126138
long _timeToSleep;
@@ -185,7 +197,6 @@ public void setHaPlanners(List<HAPlanner> haPlanners) {
185197
_haPlanners = haPlanners;
186198
}
187199

188-
189200
@Inject
190201
AgentManager _agentMgr;
191202
@Inject
@@ -231,6 +242,15 @@ public Status investigate(final long hostId) {
231242
return Status.Alert;
232243
}
233244

245+
if (!VmHaEnabled.valueIn(host.getDataCenterId())) {
246+
String message = String.format("Unable to investigate the host %s (%d), VM high availability manager is disabled.", host.getName(), hostId);
247+
if (logger.isDebugEnabled()) {
248+
logger.debug(message);
249+
}
250+
sendHostAlert(host, message);
251+
return Status.Alert;
252+
}
253+
234254
Status hostState = null;
235255
for (Investigator investigator : investigators) {
236256
hostState = investigator.isAgentAlive(host);
@@ -260,6 +280,15 @@ public void scheduleRestartForVmsOnHost(final HostVO host, boolean investigate)
260280
return;
261281
}
262282

283+
if (!VmHaEnabled.valueIn(host.getDataCenterId())) {
284+
String message = String.format("Unable to schedule restart for VMs on host %s (%d), VM high availability manager is disabled.", host.getName(), host.getId());
285+
if (logger.isDebugEnabled()) {
286+
logger.debug(message);
287+
}
288+
sendHostAlert(host, message);
289+
return;
290+
}
291+
263292
logger.warn("Scheduling restart for VMs on host " + host.getId() + "-" + host.getName());
264293

265294
final List<VMInstanceVO> vms = _instanceDao.listByHostId(host.getId());
@@ -314,12 +343,21 @@ public void scheduleRestartForVmsOnHost(final HostVO host, boolean investigate)
314343
}
315344

316345
@Override
317-
public void scheduleStop(VMInstanceVO vm, long hostId, WorkType type) {
346+
public boolean scheduleStop(VMInstanceVO vm, long hostId, WorkType type) {
318347
assert (type == WorkType.CheckStop || type == WorkType.ForceStop || type == WorkType.Stop);
319348

320349
if (_haDao.hasBeenScheduled(vm.getId(), type)) {
321350
logger.info("There's already a job scheduled to stop " + vm);
322-
return;
351+
return false;
352+
}
353+
354+
if (!VmHaEnabled.valueIn(vm.getDataCenterId())) {
355+
String message = String.format("Unable to schedule stop for the VM %s (%d) on host %d, VM high availability manager is disabled.", vm.getName(), vm.getId(), hostId);
356+
if (logger.isDebugEnabled()) {
357+
logger.debug(message);
358+
}
359+
sendVMAlert(vm, message);
360+
return false;
323361
}
324362

325363
HaWorkVO work = new HaWorkVO(vm.getId(), vm.getType(), type, Step.Scheduled, hostId, vm.getState(), 0, vm.getUpdated());
@@ -328,6 +366,7 @@ public void scheduleStop(VMInstanceVO vm, long hostId, WorkType type) {
328366
logger.debug("Scheduled " + work);
329367
}
330368
wakeupWorkers();
369+
return true;
331370
}
332371

333372
protected void wakeupWorkers() {
@@ -339,17 +378,37 @@ protected void wakeupWorkers() {
339378

340379
@Override
341380
public boolean scheduleMigration(final VMInstanceVO vm) {
342-
if (vm.getHostId() != null) {
343-
final HaWorkVO work = new HaWorkVO(vm.getId(), vm.getType(), WorkType.Migration, Step.Scheduled, vm.getHostId(), vm.getState(), 0, vm.getUpdated());
344-
_haDao.persist(work);
345-
logger.info("Scheduled migration work of VM " + vm.getUuid() + " from host " + _hostDao.findById(vm.getHostId()) + " with HAWork " + work);
346-
wakeupWorkers();
381+
if (vm.getHostId() == null) {
382+
return false;
383+
}
384+
385+
if (!VmHaEnabled.valueIn(vm.getDataCenterId())) {
386+
String message = String.format("Unable to schedule migration for the VM %s (%d) on host %d, VM high availability manager is disabled.", vm.getName(), vm.getId(), vm.getHostId());
387+
if (logger.isDebugEnabled()) {
388+
logger.debug(message);
389+
}
390+
sendVMAlert(vm, message);
391+
return false;
347392
}
393+
394+
final HaWorkVO work = new HaWorkVO(vm.getId(), vm.getType(), WorkType.Migration, Step.Scheduled, vm.getHostId(), vm.getState(), 0, vm.getUpdated());
395+
_haDao.persist(work);
396+
logger.info("Scheduled migration work of VM " + vm.getUuid() + " from host " + _hostDao.findById(vm.getHostId()) + " with HAWork " + work);
397+
wakeupWorkers();
348398
return true;
349399
}
350400

351401
@Override
352402
public void scheduleRestart(VMInstanceVO vm, boolean investigate) {
403+
if (!VmHaEnabled.valueIn(vm.getDataCenterId())) {
404+
String message = String.format("Unable to schedule restart for the VM %s (%d), VM high availability manager is disabled.", vm.getName(), vm.getId());
405+
if (logger.isDebugEnabled()) {
406+
logger.debug(message);
407+
}
408+
sendVMAlert(vm, message);
409+
return;
410+
}
411+
353412
logger.debug("HA schedule restart");
354413
Long hostId = vm.getHostId();
355414
if (hostId == null) {
@@ -440,7 +499,6 @@ public void scheduleRestart(VMInstanceVO vm, boolean investigate) {
440499
}
441500

442501
wakeupWorkers();
443-
444502
}
445503

446504
private void startVm(VirtualMachine vm, Map<VirtualMachineProfile.Param, Object> params,
@@ -737,13 +795,23 @@ public Long migrate(final HaWorkVO work) {
737795
}
738796

739797
@Override
740-
public void scheduleDestroy(VMInstanceVO vm, long hostId) {
798+
public boolean scheduleDestroy(VMInstanceVO vm, long hostId) {
799+
if (!VmHaEnabled.valueIn(vm.getDataCenterId())) {
800+
String message = String.format("Unable to schedule destroy for the VM %s (%d) on host %d, VM high availability manager is disabled.", vm.getName(), vm.getId(), hostId);
801+
if (logger.isDebugEnabled()) {
802+
logger.debug(message);
803+
}
804+
sendVMAlert(vm, message);
805+
return false;
806+
}
807+
741808
final HaWorkVO work = new HaWorkVO(vm.getId(), vm.getType(), WorkType.Destroy, Step.Scheduled, hostId, vm.getState(), 0, vm.getUpdated());
742809
_haDao.persist(work);
743810
if (logger.isDebugEnabled()) {
744811
logger.debug("Scheduled " + work.toString());
745812
}
746813
wakeupWorkers();
814+
return true;
747815
}
748816

749817
@Override
@@ -892,7 +960,17 @@ private long getRescheduleTime(WorkType workType) {
892960

893961
private void processWork(final HaWorkVO work) {
894962
final WorkType wt = work.getWorkType();
963+
final VMInstanceVO vm = _instanceDao.findById(work.getInstanceId());
895964
try {
965+
if (vm != null && !VmHaEnabled.valueIn(vm.getDataCenterId())) {
966+
if (logger.isDebugEnabled()) {
967+
logger.debug(String.format("VM high availability manager is disabled, rescheduling the HA work %s, for the VM %s (id) to retry later in case VM high availability manager is enabled on retry attempt", work, vm.getName(), vm.getId()));
968+
}
969+
long nextTime = getRescheduleTime(wt);
970+
rescheduleWork(work, nextTime);
971+
return;
972+
}
973+
896974
Long nextTime = null;
897975
if (wt == WorkType.Migration) {
898976
nextTime = migrate(work);
@@ -921,9 +999,10 @@ private void processWork(final HaWorkVO work) {
921999

9221000
// if restart failed in the middle due to an exception, the VM state may have been changed
9231001
// recapture it into the HA worker so that it can really continue in its next turn
924-
VMInstanceVO vm = _instanceDao.findById(work.getInstanceId());
925-
work.setUpdateTime(vm.getUpdated());
926-
work.setPreviousState(vm.getState());
1002+
if (vm != null) {
1003+
work.setUpdateTime(vm.getUpdated());
1004+
work.setPreviousState(vm.getState());
1005+
}
9271006
} finally {
9281007
if (!Step.Done.equals(work.getStep())) {
9291008
if (work.getTimesTried() >= _maxRetries) {
@@ -1128,11 +1207,33 @@ public String getConfigComponentName() {
11281207
public ConfigKey<?>[] getConfigKeys() {
11291208
return new ConfigKey[] {TimeBetweenCleanup, MigrationMaxRetries, TimeToSleep, TimeBetweenFailures,
11301209
StopRetryInterval, RestartRetryInterval, MigrateRetryInterval, InvestigateRetryInterval,
1131-
HAWorkers, ForceHA, KvmHAFenceHostIfHeartbeatFailsOnStorage};
1210+
HAWorkers, ForceHA, VmHaEnabled, VmHaAlertsEnabled, KvmHAFenceHostIfHeartbeatFailsOnStorage};
11321211
}
11331212

11341213
@Override
11351214
public int expungeWorkItemsByVmList(List<Long> vmIds, Long batchSize) {
11361215
return _haDao.expungeByVmList(vmIds, batchSize);
11371216
}
1217+
1218+
private void sendVMAlert(VMInstanceVO vm, String message) {
1219+
if (vm == null || !VmHaAlertsEnabled.valueIn(vm.getDataCenterId())) {
1220+
return;
1221+
}
1222+
AlertManager.AlertType alertType = AlertManager.AlertType.ALERT_TYPE_USERVM;
1223+
if (VirtualMachine.Type.DomainRouter.equals(vm.getType())) {
1224+
alertType = AlertManager.AlertType.ALERT_TYPE_DOMAIN_ROUTER;
1225+
} else if (VirtualMachine.Type.ConsoleProxy.equals(vm.getType())) {
1226+
alertType = AlertManager.AlertType.ALERT_TYPE_CONSOLE_PROXY;
1227+
} else if (VirtualMachine.Type.SecondaryStorageVm.equals(vm.getType())) {
1228+
alertType = AlertManager.AlertType.ALERT_TYPE_SSVM;
1229+
}
1230+
_alertMgr.sendAlert(alertType, vm.getDataCenterId(), vm.getPodIdToDeployIn(), message, message);
1231+
}
1232+
1233+
private void sendHostAlert(HostVO host, String message) {
1234+
if (host == null || !VmHaAlertsEnabled.valueIn(host.getDataCenterId())) {
1235+
return;
1236+
}
1237+
_alertMgr.sendAlert(AlertManager.AlertType.ALERT_TYPE_HOST, host.getDataCenterId(), host.getPodId(), message, message);
1238+
}
11381239
}

server/src/main/java/com/cloud/resource/ResourceManagerImpl.java

+7-3
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@
4141
import com.cloud.cpu.CPU;
4242
import com.cloud.exception.StorageConflictException;
4343
import com.cloud.exception.StorageUnavailableException;
44+
import com.cloud.ha.HighAvailabilityManagerImpl;
4445
import com.cloud.host.HostTagVO;
4546
import com.cloud.storage.Volume;
4647
import com.cloud.storage.VolumeVO;
@@ -1363,6 +1364,11 @@ private boolean doMaintain(final long hostId) {
13631364
throw new CloudRuntimeException("Cannot perform maintain when resource state is " + hostState + ", hostId = " + hostId);
13641365
}
13651366

1367+
final List<VMInstanceVO> vms = _vmDao.listByHostId(hostId);
1368+
if (CollectionUtils.isNotEmpty(vms) && !HighAvailabilityManagerImpl.VmHaEnabled.valueIn(host.getDataCenterId())) {
1369+
throw new CloudRuntimeException(String.format("Cannot perform maintain for the host %s (%d) as there are running VMs on it and VM high availability manager is disabled", host.getName(), hostId));
1370+
}
1371+
13661372
final MaintainAnswer answer = (MaintainAnswer)_agentMgr.easySend(hostId, new MaintainCommand());
13671373
if (answer == null || !answer.getResult()) {
13681374
logger.warn("Unable to send MaintainCommand to host: " + hostId);
@@ -1382,8 +1388,6 @@ private boolean doMaintain(final long hostId) {
13821388

13831389
/* TODO: move below to listener */
13841390
if (host.getType() == Host.Type.Routing) {
1385-
1386-
final List<VMInstanceVO> vms = _vmDao.listByHostId(hostId);
13871391
if (vms.size() == 0) {
13881392
return true;
13891393
}
@@ -2841,7 +2845,7 @@ public void deleteRoutingHost(final HostVO host, final boolean isForced, final b
28412845
logger.debug("Cannot transmit host " + host.getId() + " to Disabled state", e);
28422846
}
28432847
for (final VMInstanceVO vm : vms) {
2844-
if ((! HighAvailabilityManager.ForceHA.value() && !vm.isHaEnabled()) || vm.getState() == State.Stopping) {
2848+
if ((!HighAvailabilityManager.ForceHA.value() && !vm.isHaEnabled()) || vm.getState() == State.Stopping) {
28452849
logger.debug(String.format("Stopping %s as a part of hostDelete for %s",vm, host));
28462850
try {
28472851
_haMgr.scheduleStop(vm, host.getId(), WorkType.Stop);

0 commit comments

Comments
 (0)