19
19
import static org .apache .cloudstack .framework .config .ConfigKey .Scope .Zone ;
20
20
21
21
import java .util .ArrayList ;
22
+ import java .util .Arrays ;
22
23
import java .util .Date ;
23
24
import java .util .HashMap ;
24
25
import java .util .List ;
43
44
import org .apache .cloudstack .managed .context .ManagedContext ;
44
45
import org .apache .cloudstack .managed .context .ManagedContextRunnable ;
45
46
import org .apache .cloudstack .management .ManagementServerHost ;
47
+ import org .apache .logging .log4j .ThreadContext ;
46
48
47
49
import com .cloud .agent .AgentManager ;
48
50
import com .cloud .alert .AlertManager ;
90
92
import com .cloud .vm .VirtualMachineManager ;
91
93
import com .cloud .vm .VirtualMachineProfile ;
92
94
import com .cloud .vm .dao .VMInstanceDao ;
93
- import org .apache .logging .log4j .ThreadContext ;
94
95
95
96
/**
96
97
* HighAvailabilityManagerImpl coordinates the HA process. VMs are registered with the HA Manager for HA. The request is stored
@@ -133,6 +134,9 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements Configur
133
134
// Boolean configuration key "vm.ha.alerts.enabled", declared with Zone scope; per its own description
// it enables or disables alerts for VM HA operations and is enabled by default.
protected static ConfigKey <Boolean > VmHaAlertsEnabled = new ConfigKey <>("Advanced" , Boolean .class , "vm.ha.alerts.enabled" , "true" ,
134
135
"Enable/Disable alerts for the VM HA operations, it is enabled by default." , true , Zone );
135
136
137
// Reason types for which checkAndCancelWorkIfNeeded may cancel a work item.
// NOTE(review): keep Arrays.asList here. The legacy scheduleStop/scheduleMigration/scheduleRestart
// overloads pass a null reason type, so contains(null) is presumably reachable; Arrays.asList
// answers false for it, whereas List.of(...).contains(null) would throw NullPointerException.
+ protected static final List <ReasonType > CancellableWorkReasonTypes =
138
+ Arrays .asList (ReasonType .HostMaintenance , ReasonType .HostDown , ReasonType .HostDegraded );
139
+
136
140
WorkerThread [] _workers ;
137
141
boolean _stopped ;
138
142
long _timeToSleep ;
@@ -269,8 +273,7 @@ public Status investigate(final long hostId) {
269
273
}
270
274
271
275
@ Override
272
- public void scheduleRestartForVmsOnHost (final HostVO host , boolean investigate ) {
273
-
276
+ public void scheduleRestartForVmsOnHost (final HostVO host , boolean investigate , ReasonType reasonType ) {
274
277
if (host .getType () != Host .Type .Routing ) {
275
278
return ;
276
279
}
@@ -337,12 +340,12 @@ public void scheduleRestartForVmsOnHost(final HostVO host, boolean investigate)
337
340
logger .debug ("VM {} is not on down host {} it is on other host {} VM HA is done" , vm , host , hostId );
338
341
continue ;
339
342
}
340
- scheduleRestart (vm , investigate );
343
+ scheduleRestart (vm , investigate , reasonType );
341
344
}
342
345
}
343
346
344
347
@ Override
345
- public boolean scheduleStop (VMInstanceVO vm , long hostId , WorkType type ) {
348
+ public boolean scheduleStop (VMInstanceVO vm , long hostId , WorkType type , ReasonType reasonType ) {
346
349
assert (type == WorkType .CheckStop || type == WorkType .ForceStop || type == WorkType .Stop );
347
350
348
351
if (_haDao .hasBeenScheduled (vm .getId (), type )) {
@@ -359,7 +362,7 @@ public boolean scheduleStop(VMInstanceVO vm, long hostId, WorkType type) {
359
362
return false ;
360
363
}
361
364
362
- HaWorkVO work = new HaWorkVO (vm .getId (), vm .getType (), type , Step .Scheduled , hostId , vm .getState (), 0 , vm .getUpdated ());
365
+ HaWorkVO work = new HaWorkVO (vm .getId (), vm .getType (), type , Step .Scheduled , hostId , vm .getState (), 0 , vm .getUpdated (), reasonType );
363
366
_haDao .persist (work );
364
367
if (logger .isDebugEnabled ()) {
365
368
logger .debug ("Scheduled " + work );
@@ -368,6 +371,11 @@ public boolean scheduleStop(VMInstanceVO vm, long hostId, WorkType type) {
368
371
return true ;
369
372
}
370
373
374
/**
 * Schedules stop work for a VM without recording a reason.
 * Delegates to {@link #scheduleStop(VMInstanceVO, long, WorkType, ReasonType)} with a {@code null} reason type.
 *
 * @param vm the VM to stop
 * @param hostId id of the host the work is scheduled against
 * @param type the kind of stop work; the delegate asserts it is CheckStop, ForceStop or Stop
 * @return the result of the four-argument overload
 */
+ @ Override
375
+ public boolean scheduleStop (VMInstanceVO vm , long hostId , WorkType type ) {
376
// null reason type: no reason is recorded for work scheduled through this overload
+ return scheduleStop (vm , hostId , type , null );
377
+ }
378
+
371
379
protected void wakeupWorkers () {
372
380
logger .debug ("Wakeup workers HA" );
373
381
for (WorkerThread worker : _workers ) {
@@ -376,7 +384,7 @@ protected void wakeupWorkers() {
376
384
}
377
385
378
386
@ Override
379
- public boolean scheduleMigration (final VMInstanceVO vm ) {
387
+ public boolean scheduleMigration (final VMInstanceVO vm , ReasonType reasonType ) {
380
388
if (vm .getHostId () == null ) {
381
389
return false ;
382
390
}
@@ -390,15 +398,20 @@ public boolean scheduleMigration(final VMInstanceVO vm) {
390
398
return false ;
391
399
}
392
400
393
- final HaWorkVO work = new HaWorkVO (vm .getId (), vm .getType (), WorkType .Migration , Step .Scheduled , vm .getHostId (), vm .getState (), 0 , vm .getUpdated ());
401
+ final HaWorkVO work = new HaWorkVO (vm .getId (), vm .getType (), WorkType .Migration , Step .Scheduled , vm .getHostId (), vm .getState (), 0 , vm .getUpdated (), reasonType );
394
402
_haDao .persist (work );
395
403
logger .info ("Scheduled migration work of VM {} from host {} with HAWork {}" , vm , _hostDao .findById (vm .getHostId ()), work );
396
404
wakeupWorkers ();
397
405
return true ;
398
406
}
399
407
400
408
/**
 * Schedules migration work for a VM without recording a reason.
 * Delegates to {@link #scheduleMigration(VMInstanceVO, ReasonType)} with a {@code null} reason type.
 *
 * @param vm the VM to migrate
 * @return the result of the two-argument overload
 */
@ Override
401
- public void scheduleRestart (VMInstanceVO vm , boolean investigate ) {
409
+ public boolean scheduleMigration (final VMInstanceVO vm ) {
410
// null reason type: no reason is recorded for work scheduled through this overload
+ return scheduleMigration (vm , null );
411
+ }
412
+
413
+ @ Override
414
+ public void scheduleRestart (VMInstanceVO vm , boolean investigate , ReasonType reasonType ) {
402
415
if (!VmHaEnabled .valueIn (vm .getDataCenterId ())) {
403
416
String message = String .format ("Unable to schedule restart for the VM %s (%d), VM high availability manager is disabled." , vm .getName (), vm .getId ());
404
417
if (logger .isDebugEnabled ()) {
@@ -490,7 +503,7 @@ public void scheduleRestart(VMInstanceVO vm, boolean investigate) {
490
503
}
491
504
492
505
HaWorkVO work = new HaWorkVO (vm .getId (), vm .getType (), WorkType .HA , investigate ? Step .Investigating : Step .Scheduled ,
493
- hostId != null ? hostId : 0L , vm .getState (), timesTried , vm .getUpdated ());
506
+ hostId != null ? hostId : 0L , vm .getState (), timesTried , vm .getUpdated (), reasonType );
494
507
_haDao .persist (work );
495
508
496
509
if (logger .isInfoEnabled ()) {
@@ -500,6 +513,11 @@ public void scheduleRestart(VMInstanceVO vm, boolean investigate) {
500
513
wakeupWorkers ();
501
514
}
502
515
516
/**
 * Schedules restart (HA) work for a VM without recording a reason.
 * Delegates to {@link #scheduleRestart(VMInstanceVO, boolean, ReasonType)} with a {@code null} reason type.
 *
 * @param vm the VM to restart
 * @param investigate passed through unchanged; the delegate creates the work in the Investigating step
 *                    when this is {@code true} and in the Scheduled step otherwise
 */
+ @ Override
517
+ public void scheduleRestart (VMInstanceVO vm , boolean investigate ) {
518
// null reason type: no reason is recorded for work scheduled through this overload
+ scheduleRestart (vm , investigate , null );
519
+ }
520
+
503
521
private void startVm (VirtualMachine vm , Map <VirtualMachineProfile .Param , Object > params ,
504
522
DeploymentPlanner planner ) throws InsufficientCapacityException , ResourceUnavailableException ,
505
523
ConcurrentOperationException , OperationTimedoutException {
@@ -561,6 +579,9 @@ protected Long restart(final HaWorkVO work) {
561
579
logger .info ("Unable to find vm: " + vmId );
562
580
return null ;
563
581
}
582
+ if (checkAndCancelWorkIfNeeded (work )) {
583
+ return null ;
584
+ }
564
585
565
586
logger .info ("HA on " + vm );
566
587
if (vm .getState () != work .getPreviousState () || vm .getUpdated () != work .getUpdateTime ()) {
@@ -762,6 +783,22 @@ protected Long restart(final HaWorkVO work) {
762
783
return (System .currentTimeMillis () >> 10 ) + _restartRetryInterval ;
763
784
}
764
785
786
/**
 * Decides whether an HA work item has become unnecessary and, if so, marks it as cancelled.
 * <p>
 * The work is cancelled only when all of the following hold: its step is {@code Step.Investigating},
 * its reason type is contained in {@code CancellableWorkReasonTypes}, and {@code investigate(hostId)}
 * reports the host as {@code Status.Up}. In that case the step of the passed object is set to
 * {@code Step.Cancelled}; this method itself does not persist the change.
 *
 * @param work the HA work item being processed
 * @return {@code true} if the work was marked cancelled (callers in this class then return without
 *         doing the work), {@code false} if the work should proceed
 */
+ protected boolean checkAndCancelWorkIfNeeded (final HaWorkVO work ) {
787
// Only work that is still in the Investigating step is a candidate for cancellation.
+ if (!Step .Investigating .equals (work .getStep ())) {
788
+ return false ;
789
+ }
790
// A reason type outside the list (including null — TODO confirm getReasonType can return null) is never cancelled.
+ if (!CancellableWorkReasonTypes .contains (work .getReasonType ())) {
791
+ return false ;
792
+ }
793
// Re-check the host the work was scheduled for; any status other than Up means the work is still needed.
+ Status hostStatus = investigate (work .getHostId ());
794
+ if (!Status .Up .equals (hostStatus )) {
795
+ return false ;
796
+ }
797
// The lambda defers building the message argument until debug logging is actually enabled.
+ logger .debug ("Cancelling {} as it is not needed anymore" , () -> work );
798
+ work .setStep (Step .Cancelled );
799
+ return true ;
800
+ }
801
+
765
802
public Long migrate (final HaWorkVO work ) {
766
803
long vmId = work .getInstanceId ();
767
804
long srcHostId = work .getHostId ();
@@ -772,6 +809,9 @@ public Long migrate(final HaWorkVO work) {
772
809
logger .info ("Unable to find vm: " + vmId + ", skipping migrate." );
773
810
return null ;
774
811
}
812
+ if (checkAndCancelWorkIfNeeded (work )) {
813
+ return null ;
814
+ }
775
815
logger .info ("Migration attempt: for VM {}from host {}. Starting attempt: {}/{} times." , vm , srcHost , 1 + work .getTimesTried (), _maxRetries );
776
816
try {
777
817
work .setStep (Step .Migrating );
@@ -791,7 +831,7 @@ public Long migrate(final HaWorkVO work) {
791
831
}
792
832
793
833
@ Override
794
- public boolean scheduleDestroy (VMInstanceVO vm , long hostId ) {
834
+ public boolean scheduleDestroy (VMInstanceVO vm , long hostId , ReasonType reasonType ) {
795
835
if (!VmHaEnabled .valueIn (vm .getDataCenterId ())) {
796
836
String message = String .format ("Unable to schedule destroy for the VM %s (%d) on host %d, VM high availability manager is disabled." , vm .getName (), vm .getId (), hostId );
797
837
if (logger .isDebugEnabled ()) {
@@ -801,7 +841,7 @@ public boolean scheduleDestroy(VMInstanceVO vm, long hostId) {
801
841
return false ;
802
842
}
803
843
804
- final HaWorkVO work = new HaWorkVO (vm .getId (), vm .getType (), WorkType .Destroy , Step .Scheduled , hostId , vm .getState (), 0 , vm .getUpdated ());
844
+ final HaWorkVO work = new HaWorkVO (vm .getId (), vm .getType (), WorkType .Destroy , Step .Scheduled , hostId , vm .getState (), 0 , vm .getUpdated (), reasonType );
805
845
_haDao .persist (work );
806
846
if (logger .isDebugEnabled ()) {
807
847
logger .debug ("Scheduled " + work .toString ());
@@ -838,6 +878,9 @@ protected Long destroyVM(final HaWorkVO work) {
838
878
logger .info ("No longer can find VM " + work .getInstanceId () + ". Throwing away " + work );
839
879
return null ;
840
880
}
881
+ if (checkAndCancelWorkIfNeeded (work )) {
882
+ return null ;
883
+ }
841
884
boolean expunge = VirtualMachine .Type .SecondaryStorageVm .equals (vm .getType ())
842
885
|| VirtualMachine .Type .ConsoleProxy .equals (vm .getType ());
843
886
if (!expunge && VirtualMachine .State .Destroyed .equals (work .getPreviousState ())) {
@@ -872,6 +915,9 @@ protected Long stopVM(final HaWorkVO work) throws ConcurrentOperationException {
872
915
work .setStep (Step .Done );
873
916
return null ;
874
917
}
918
+ if (checkAndCancelWorkIfNeeded (work )) {
919
+ return null ;
920
+ }
875
921
logger .info ("Stopping " + vm );
876
922
try {
877
923
if (work .getWorkType () == WorkType .Stop ) {
@@ -1057,6 +1103,8 @@ public boolean configure(final String name, final Map<String, Object> xmlParams)
1057
1103
public boolean start () {
1058
1104
_stopped = false ;
1059
1105
1106
+ _haDao .markPendingWorksAsInvestigating ();
1107
+
1060
1108
for (final WorkerThread thread : _workers ) {
1061
1109
thread .start ();
1062
1110
}
@@ -1074,6 +1122,8 @@ public boolean stop() {
1074
1122
1075
1123
_executor .shutdown ();
1076
1124
1125
+ _haDao .markServerPendingWorksAsInvestigating (_msServer .getId ());
1126
+
1077
1127
return true ;
1078
1128
}
1079
1129
0 commit comments