Skip to content

Commit 048649d

Browse files
committed
Merge release branch 4.20 to main
* 4.20:
  server: investigate pending HA work when executing in new MS session (#10167)
  extra null guard (#10264)
2 parents 789e269 + 717ce98 commit 048649d

File tree

11 files changed

+262
-46
lines changed

11 files changed

+262
-46
lines changed

engine/components-api/src/main/java/com/cloud/ha/HighAvailabilityManager.java

+16-5
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,13 @@ public enum WorkType {
8484
HA; // Restart a VM.
8585
}
8686

87+
enum ReasonType {
88+
Unknown,
89+
HostMaintenance,
90+
HostDown,
91+
HostDegraded;
92+
}
93+
8794
enum Step {
8895
Scheduled, Investigating, Fencing, Stopping, Restarting, Migrating, Cancelled, Done, Error,
8996
}
@@ -92,7 +99,7 @@ enum Step {
9299
* Investigate why a host has disconnected and migrate the VMs on it
93100
* if necessary.
94101
*
95-
* @param host - the host that has disconnected.
102+
* @param hostId - the id of the host that has disconnected.
96103
*/
97104
Status investigate(long hostId);
98105

@@ -109,17 +116,19 @@ enum Step {
109116
* @param investigate must be investigated before we do anything with this vm.
110117
*/
111118
void scheduleRestart(VMInstanceVO vm, boolean investigate);
119+
void scheduleRestart(VMInstanceVO vm, boolean investigate, ReasonType reasonType);
112120

113121
void cancelDestroy(VMInstanceVO vm, Long hostId);
114122

115-
boolean scheduleDestroy(VMInstanceVO vm, long hostId);
123+
boolean scheduleDestroy(VMInstanceVO vm, long hostId, ReasonType reasonType);
116124

117125
/**
118126
* Schedule restarts for all vms running on the host.
119127
* @param host host.
120-
* @param investigate TODO
128+
* @param investigate whether to investigate
129+
* @param reasonType reason for HA work
121130
*/
122-
void scheduleRestartForVmsOnHost(HostVO host, boolean investigate);
131+
void scheduleRestartForVmsOnHost(HostVO host, boolean investigate, ReasonType reasonType);
123132

124133
/**
125134
* Schedule the vm for migration.
@@ -128,6 +137,7 @@ enum Step {
128137
* @return true if schedule worked.
129138
*/
130139
boolean scheduleMigration(VMInstanceVO vm);
140+
boolean scheduleMigration(VMInstanceVO vm, ReasonType reasonType);
131141

132142
List<VMInstanceVO> findTakenMigrationWork();
133143

@@ -140,10 +150,11 @@ enum Step {
140150
* 3. Check if a VM has been stopped: WorkType.CheckStop
141151
*
142152
* @param vm virtual machine to stop.
143-
* @param host host the virtual machine is on.
153+
* @param hostId the id of the host the virtual machine is on.
144154
* @param type which type of stop is requested.
145155
*/
146156
boolean scheduleStop(VMInstanceVO vm, long hostId, WorkType type);
157+
boolean scheduleStop(VMInstanceVO vm, long hostId, WorkType type, ReasonType reasonType);
147158

148159
void cancelScheduledMigrations(HostVO host);
149160

engine/orchestration/src/main/java/com/cloud/agent/manager/AgentManagerImpl.java

+3-3
Original file line numberDiff line numberDiff line change
@@ -633,11 +633,11 @@ protected AgentAttache notifyMonitorsOfConnection(final AgentAttache attache, fi
633633
}
634634
} catch (final HypervisorVersionChangedException hvce) {
635635
handleDisconnectWithoutInvestigation(attache, Event.ShutdownRequested, true, true);
636-
throw new CloudRuntimeException("Unable to connect " + attache.getId(), hvce);
636+
throw new CloudRuntimeException("Unable to connect " + (attache == null ? "<unknown agent>" : attache.getId()), hvce);
637637
} catch (final Exception e) {
638638
logger.error("Monitor {} says there is an error in the connect process for {} due to {}", monitor.second().getClass().getSimpleName(), hostId, e.getMessage(), e);
639639
handleDisconnectWithoutInvestigation(attache, Event.AgentDisconnected, true, true);
640-
throw new CloudRuntimeException("Unable to connect " + attache.getId(), e);
640+
throw new CloudRuntimeException("Unable to connect " + (attache == null ? "<unknown agent>" : attache.getId()), e);
641641
}
642642
}
643643
}
@@ -989,7 +989,7 @@ protected boolean handleDisconnectWithInvestigation(final AgentAttache attache,
989989
handleDisconnectWithoutInvestigation(attache, event, true, true);
990990
host = _hostDao.findById(hostId); // Maybe the host magically reappeared?
991991
if (host != null && host.getStatus() == Status.Down) {
992-
_haMgr.scheduleRestartForVmsOnHost(host, true);
992+
_haMgr.scheduleRestartForVmsOnHost(host, true, HighAvailabilityManager.ReasonType.HostDown);
993993
}
994994
return true;
995995
}

engine/schema/src/main/resources/META-INF/db/schema-42000to42010.sql

+3
Original file line numberDiff line numberDiff line change
@@ -35,3 +35,6 @@ CALL `cloud`.`IDEMPOTENT_ADD_COLUMN`('cloud.volumes', 'last_id', 'bigint(20) uns
3535

3636
-- Add used_iops column to support IOPS data in storage stats
3737
CALL `cloud`.`IDEMPOTENT_ADD_COLUMN`('cloud.storage_pool', 'used_iops', 'bigint unsigned DEFAULT NULL COMMENT "IOPS currently in use for this storage pool" ');
38+
39+
-- Add reason column for op_ha_work
40+
CALL `cloud`.`IDEMPOTENT_ADD_COLUMN`('cloud.op_ha_work', 'reason', 'varchar(32) DEFAULT NULL COMMENT "Reason for the HA work"');

server/src/main/java/com/cloud/ha/HaWorkVO.java

+14-1
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,10 @@ public class HaWorkVO implements InternalIdentity {
8686
@Column(name = "tried")
8787
int timesTried;
8888

89+
@Column(name = "reason")
90+
@Enumerated(value = EnumType.STRING)
91+
private HighAvailabilityManager.ReasonType reasonType;
92+
8993
protected HaWorkVO() {
9094
}
9195

@@ -179,7 +183,7 @@ public void setPreviousState(State state) {
179183
}
180184

181185
public HaWorkVO(final long instanceId, final VirtualMachine.Type type, final WorkType workType, final Step step, final long hostId, final State previousState,
182-
final int timesTried, final long updated) {
186+
final int timesTried, final long updated, HighAvailabilityManager.ReasonType reasonType) {
183187
this.workType = workType;
184188
this.type = type;
185189
this.instanceId = instanceId;
@@ -191,6 +195,7 @@ public HaWorkVO(final long instanceId, final VirtualMachine.Type type, final Wor
191195
this.step = step;
192196
this.timeToTry = System.currentTimeMillis() >> 10;
193197
this.updateTime = updated;
198+
this.reasonType = reasonType;
194199
}
195200

196201
@Override
@@ -207,4 +212,12 @@ public String toString() {
207212
.append("]")
208213
.toString();
209214
}
215+
216+
public HighAvailabilityManager.ReasonType getReasonType() {
217+
return reasonType;
218+
}
219+
220+
public void setReasonType(HighAvailabilityManager.ReasonType reasonType) {
221+
this.reasonType = reasonType;
222+
}
210223
}

server/src/main/java/com/cloud/ha/HighAvailabilityManagerImpl.java

+62-12
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
import static org.apache.cloudstack.framework.config.ConfigKey.Scope.Zone;
2020

2121
import java.util.ArrayList;
22+
import java.util.Arrays;
2223
import java.util.Date;
2324
import java.util.HashMap;
2425
import java.util.List;
@@ -43,6 +44,7 @@
4344
import org.apache.cloudstack.managed.context.ManagedContext;
4445
import org.apache.cloudstack.managed.context.ManagedContextRunnable;
4546
import org.apache.cloudstack.management.ManagementServerHost;
47+
import org.apache.logging.log4j.ThreadContext;
4648

4749
import com.cloud.agent.AgentManager;
4850
import com.cloud.alert.AlertManager;
@@ -90,7 +92,6 @@
9092
import com.cloud.vm.VirtualMachineManager;
9193
import com.cloud.vm.VirtualMachineProfile;
9294
import com.cloud.vm.dao.VMInstanceDao;
93-
import org.apache.logging.log4j.ThreadContext;
9495

9596
/**
9697
* HighAvailabilityManagerImpl coordinates the HA process. VMs are registered with the HA Manager for HA. The request is stored
@@ -133,6 +134,9 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements Configur
133134
protected static ConfigKey<Boolean> VmHaAlertsEnabled = new ConfigKey<>("Advanced", Boolean.class, "vm.ha.alerts.enabled", "true",
134135
"Enable/Disable alerts for the VM HA operations, it is enabled by default.", true, Zone);
135136

137+
protected static final List<ReasonType> CancellableWorkReasonTypes =
138+
Arrays.asList(ReasonType.HostMaintenance, ReasonType.HostDown, ReasonType.HostDegraded);
139+
136140
WorkerThread[] _workers;
137141
boolean _stopped;
138142
long _timeToSleep;
@@ -269,8 +273,7 @@ public Status investigate(final long hostId) {
269273
}
270274

271275
@Override
272-
public void scheduleRestartForVmsOnHost(final HostVO host, boolean investigate) {
273-
276+
public void scheduleRestartForVmsOnHost(final HostVO host, boolean investigate, ReasonType reasonType) {
274277
if (host.getType() != Host.Type.Routing) {
275278
return;
276279
}
@@ -337,12 +340,12 @@ public void scheduleRestartForVmsOnHost(final HostVO host, boolean investigate)
337340
logger.debug("VM {} is not on down host {} it is on other host {} VM HA is done", vm, host, hostId);
338341
continue;
339342
}
340-
scheduleRestart(vm, investigate);
343+
scheduleRestart(vm, investigate, reasonType);
341344
}
342345
}
343346

344347
@Override
345-
public boolean scheduleStop(VMInstanceVO vm, long hostId, WorkType type) {
348+
public boolean scheduleStop(VMInstanceVO vm, long hostId, WorkType type, ReasonType reasonType) {
346349
assert (type == WorkType.CheckStop || type == WorkType.ForceStop || type == WorkType.Stop);
347350

348351
if (_haDao.hasBeenScheduled(vm.getId(), type)) {
@@ -359,7 +362,7 @@ public boolean scheduleStop(VMInstanceVO vm, long hostId, WorkType type) {
359362
return false;
360363
}
361364

362-
HaWorkVO work = new HaWorkVO(vm.getId(), vm.getType(), type, Step.Scheduled, hostId, vm.getState(), 0, vm.getUpdated());
365+
HaWorkVO work = new HaWorkVO(vm.getId(), vm.getType(), type, Step.Scheduled, hostId, vm.getState(), 0, vm.getUpdated(), reasonType);
363366
_haDao.persist(work);
364367
if (logger.isDebugEnabled()) {
365368
logger.debug("Scheduled " + work);
@@ -368,6 +371,11 @@ public boolean scheduleStop(VMInstanceVO vm, long hostId, WorkType type) {
368371
return true;
369372
}
370373

374+
@Override
375+
public boolean scheduleStop(VMInstanceVO vm, long hostId, WorkType type) {
376+
return scheduleStop(vm, hostId, type, null);
377+
}
378+
371379
protected void wakeupWorkers() {
372380
logger.debug("Wakeup workers HA");
373381
for (WorkerThread worker : _workers) {
@@ -376,7 +384,7 @@ protected void wakeupWorkers() {
376384
}
377385

378386
@Override
379-
public boolean scheduleMigration(final VMInstanceVO vm) {
387+
public boolean scheduleMigration(final VMInstanceVO vm, ReasonType reasonType) {
380388
if (vm.getHostId() == null) {
381389
return false;
382390
}
@@ -390,15 +398,20 @@ public boolean scheduleMigration(final VMInstanceVO vm) {
390398
return false;
391399
}
392400

393-
final HaWorkVO work = new HaWorkVO(vm.getId(), vm.getType(), WorkType.Migration, Step.Scheduled, vm.getHostId(), vm.getState(), 0, vm.getUpdated());
401+
final HaWorkVO work = new HaWorkVO(vm.getId(), vm.getType(), WorkType.Migration, Step.Scheduled, vm.getHostId(), vm.getState(), 0, vm.getUpdated(), reasonType);
394402
_haDao.persist(work);
395403
logger.info("Scheduled migration work of VM {} from host {} with HAWork {}", vm, _hostDao.findById(vm.getHostId()), work);
396404
wakeupWorkers();
397405
return true;
398406
}
399407

400408
@Override
401-
public void scheduleRestart(VMInstanceVO vm, boolean investigate) {
409+
public boolean scheduleMigration(final VMInstanceVO vm) {
410+
return scheduleMigration(vm, null);
411+
}
412+
413+
@Override
414+
public void scheduleRestart(VMInstanceVO vm, boolean investigate, ReasonType reasonType) {
402415
if (!VmHaEnabled.valueIn(vm.getDataCenterId())) {
403416
String message = String.format("Unable to schedule restart for the VM %s (%d), VM high availability manager is disabled.", vm.getName(), vm.getId());
404417
if (logger.isDebugEnabled()) {
@@ -490,7 +503,7 @@ public void scheduleRestart(VMInstanceVO vm, boolean investigate) {
490503
}
491504

492505
HaWorkVO work = new HaWorkVO(vm.getId(), vm.getType(), WorkType.HA, investigate ? Step.Investigating : Step.Scheduled,
493-
hostId != null ? hostId : 0L, vm.getState(), timesTried, vm.getUpdated());
506+
hostId != null ? hostId : 0L, vm.getState(), timesTried, vm.getUpdated(), reasonType);
494507
_haDao.persist(work);
495508

496509
if (logger.isInfoEnabled()) {
@@ -500,6 +513,11 @@ public void scheduleRestart(VMInstanceVO vm, boolean investigate) {
500513
wakeupWorkers();
501514
}
502515

516+
@Override
517+
public void scheduleRestart(VMInstanceVO vm, boolean investigate) {
518+
scheduleRestart(vm, investigate, null);
519+
}
520+
503521
private void startVm(VirtualMachine vm, Map<VirtualMachineProfile.Param, Object> params,
504522
DeploymentPlanner planner) throws InsufficientCapacityException, ResourceUnavailableException,
505523
ConcurrentOperationException, OperationTimedoutException {
@@ -561,6 +579,9 @@ protected Long restart(final HaWorkVO work) {
561579
logger.info("Unable to find vm: " + vmId);
562580
return null;
563581
}
582+
if (checkAndCancelWorkIfNeeded(work)) {
583+
return null;
584+
}
564585

565586
logger.info("HA on " + vm);
566587
if (vm.getState() != work.getPreviousState() || vm.getUpdated() != work.getUpdateTime()) {
@@ -762,6 +783,22 @@ protected Long restart(final HaWorkVO work) {
762783
return (System.currentTimeMillis() >> 10) + _restartRetryInterval;
763784
}
764785

786+
protected boolean checkAndCancelWorkIfNeeded(final HaWorkVO work) {
787+
if (!Step.Investigating.equals(work.getStep())) {
788+
return false;
789+
}
790+
if (!CancellableWorkReasonTypes.contains(work.getReasonType())) {
791+
return false;
792+
}
793+
Status hostStatus = investigate(work.getHostId());
794+
if (!Status.Up.equals(hostStatus)) {
795+
return false;
796+
}
797+
logger.debug("Cancelling {} as it is not needed anymore", () -> work);
798+
work.setStep(Step.Cancelled);
799+
return true;
800+
}
801+
765802
public Long migrate(final HaWorkVO work) {
766803
long vmId = work.getInstanceId();
767804
long srcHostId = work.getHostId();
@@ -772,6 +809,9 @@ public Long migrate(final HaWorkVO work) {
772809
logger.info("Unable to find vm: " + vmId + ", skipping migrate.");
773810
return null;
774811
}
812+
if (checkAndCancelWorkIfNeeded(work)) {
813+
return null;
814+
}
775815
logger.info("Migration attempt: for VM {}from host {}. Starting attempt: {}/{} times.", vm, srcHost, 1 + work.getTimesTried(), _maxRetries);
776816
try {
777817
work.setStep(Step.Migrating);
@@ -791,7 +831,7 @@ public Long migrate(final HaWorkVO work) {
791831
}
792832

793833
@Override
794-
public boolean scheduleDestroy(VMInstanceVO vm, long hostId) {
834+
public boolean scheduleDestroy(VMInstanceVO vm, long hostId, ReasonType reasonType) {
795835
if (!VmHaEnabled.valueIn(vm.getDataCenterId())) {
796836
String message = String.format("Unable to schedule destroy for the VM %s (%d) on host %d, VM high availability manager is disabled.", vm.getName(), vm.getId(), hostId);
797837
if (logger.isDebugEnabled()) {
@@ -801,7 +841,7 @@ public boolean scheduleDestroy(VMInstanceVO vm, long hostId) {
801841
return false;
802842
}
803843

804-
final HaWorkVO work = new HaWorkVO(vm.getId(), vm.getType(), WorkType.Destroy, Step.Scheduled, hostId, vm.getState(), 0, vm.getUpdated());
844+
final HaWorkVO work = new HaWorkVO(vm.getId(), vm.getType(), WorkType.Destroy, Step.Scheduled, hostId, vm.getState(), 0, vm.getUpdated(), reasonType);
805845
_haDao.persist(work);
806846
if (logger.isDebugEnabled()) {
807847
logger.debug("Scheduled " + work.toString());
@@ -838,6 +878,9 @@ protected Long destroyVM(final HaWorkVO work) {
838878
logger.info("No longer can find VM " + work.getInstanceId() + ". Throwing away " + work);
839879
return null;
840880
}
881+
if (checkAndCancelWorkIfNeeded(work)) {
882+
return null;
883+
}
841884
boolean expunge = VirtualMachine.Type.SecondaryStorageVm.equals(vm.getType())
842885
|| VirtualMachine.Type.ConsoleProxy.equals(vm.getType());
843886
if (!expunge && VirtualMachine.State.Destroyed.equals(work.getPreviousState())) {
@@ -872,6 +915,9 @@ protected Long stopVM(final HaWorkVO work) throws ConcurrentOperationException {
872915
work.setStep(Step.Done);
873916
return null;
874917
}
918+
if (checkAndCancelWorkIfNeeded(work)) {
919+
return null;
920+
}
875921
logger.info("Stopping " + vm);
876922
try {
877923
if (work.getWorkType() == WorkType.Stop) {
@@ -1057,6 +1103,8 @@ public boolean configure(final String name, final Map<String, Object> xmlParams)
10571103
public boolean start() {
10581104
_stopped = false;
10591105

1106+
_haDao.markPendingWorksAsInvestigating();
1107+
10601108
for (final WorkerThread thread : _workers) {
10611109
thread.start();
10621110
}
@@ -1074,6 +1122,8 @@ public boolean stop() {
10741122

10751123
_executor.shutdown();
10761124

1125+
_haDao.markServerPendingWorksAsInvestigating(_msServer.getId());
1126+
10771127
return true;
10781128
}
10791129

server/src/main/java/com/cloud/ha/dao/HighAvailabilityDao.java

+2
Original file line numberDiff line numberDiff line change
@@ -86,4 +86,6 @@ public interface HighAvailabilityDao extends GenericDao<HaWorkVO, Long> {
8686

8787
List<HaWorkVO> listPendingMigrationsForVm(long vmId);
8888
int expungeByVmList(List<Long> vmIds, Long batchSize);
89+
void markPendingWorksAsInvestigating();
90+
void markServerPendingWorksAsInvestigating(long managementServerId);
8991
}

0 commit comments

Comments
 (0)