% (Removed: web-scrape artifacts — GitHub page chrome and line-number gutter
% accidentally captured ahead of the LaTeX source.)
% vim: ts=4 sw=4 et ft=tex
\chapter{Loop Control}
\label{chap:lc}
\label{chap:loop_control}
\status{Peter has checked the first 3 sections and I will retrieve them from
him later.
The next three sections are now ready for someone to check.}
In the previous chapter we described our
system that uses profiling data
to automatically parallelise Mercury programs by
finding conjunctions with expensive conjuncts
that can run in parallel with minimal synchronisation delays.
This worked very well in some programs but not as well as we had hoped for
others,
including the raytracer.
This is because the way Mercury must execute dependent conjunctions
and the way programmers typically write logic programs are at odds.
We introduced this as
``the right recursion problem''
in Section~\ref{sec:rts_original_scheduling_performance}.
In this chapter we present a novel program transformation that eliminates
this problem in all situations.
The transformation has several benefits:
First, it reduces peak memory consumption
by putting a limit on how many stacks
a conjunction will need to have in memory at the same time.
Second,
it reduces the number of synchronisation barriers needed
from one per loop iteration to one per loop.
Third, it allows recursive calls inside parallel conjunctions to take
advantage of tail recursion optimisation.
Finally, it obsoletes the conjunct reordering transformation.
Our benchmark results show that our new transformation
greatly increases the speedups we can get from parallelising Mercury
programs;
in one case, it changes no speedup into almost perfect speedup on four cores.
We have written about the problem elsewhere in the dissertation;
however, we have found that this problem is sometimes difficult to
understand.
Therefore
the introduction section
(Section~\ref{sec:lc_intro})
briefly describes the problem,
providing only the details necessary to understand and evaluate the rest of
this chapter.
\paul{It also motivates the reader correctly, but I do not say this.}
For more details about the problem, see
Sections~\ref{sec:rts_original_scheduling}
and~\ref{sec:rts_original_scheduling_performance};
see also \citet{bone:2012:loop_control}, the paper on which this chapter is
based.
The rest of the chapter is organised as follows.
Section~\ref{sec:lc_transformation} describes
the program transformation we have developed
to control memory consumption by loops.
Section~\ref{sec:lc_perf} evaluates
how our system works in practice on some benchmarks.
Section~\ref{sec:lc_further_work}
describes potential further work,
and
Section~\ref{sec:lc_conc} concludes with discussion of related work.
\section{Introduction}
\label{sec:lc_intro}
\status{This section is ready for Peter Schachte to check.}
The implementation of a parallel conjunction
has to execute the first conjunct after spawning off the later conjuncts.
For dependent conjunctions, it cannot be done the other way around,
because only the first conjunct is guaranteed to be immediately executable:
later conjuncts may need to wait for data to be generated by earlier
conjuncts.
This poses a problem when the last conjunct contains a recursive call:
\begin{itemize}
\item
the state of the computation up to this iteration of the loop
is stored in the stack used by the original computation's context,
whereas
\item
the state of the computation after this iteration of the loop
is stored in the stack used by the spawned off context.
\end{itemize}
We can continue the computation after the parallel conjunction
only if we have both the original stack
and the results computed on the spawned-off stack.
This means the original stack must be kept in memory
until the recursive call is done.
However, there is no way to distinguish
the first iteration of a loop from the second, third, etc.,
so we must preserve the original stack on \emph{every} iteration.
This problem is very common,
since in logic programming languages,
tail recursive code has long been the preferred way to write a loop.
In independent code,
we can work around the issue by reordering the conjuncts
in a conjunction (see Section~\ref{sec:rts_reorder}),
but in dependent code this is not possible.
We call this the ``right recursion problem'' because its effects are at
their worst when the recursive call is on the right hand side of a
parallel conjunction operator.
Unfortunately,
this is a natural way to (manually or automatically) parallelise programs
that were originally written with tail recursion in mind.
Thus, parallelisation often
transforms tail recursive sequential computations,
which run in constant stack space,
into parallel computations
that allocate a complete stack for each recursive call
and do not free them until the recursive call returns.
This means that each iteration effectively requires memory
to store an entire stack, not just a stack frame.
\picfigurelabel{linear_context_usage}{fig:linear_context_usage2}{Linear context usage in right recursion}
Figure~\ref{fig:linear_context_usage2} shows a visualisation of this stack
usage.
At the top left,
four contexts are created to execute four iterations of a loop,
as indicated by boxes.
Once each of these iterations finishes,
its context stays in memory but is suspended,
indicated by the long vertical lines.
Another four iterations of the loop create another four contexts, and so on.
Later, when all iterations of the loop have been executed,
each of the blocked contexts resumes execution and immediately exits,
indicated by the horizontal line at the bottom of each of the vertical
lines.
If we allow the number of contexts, and therefore their stacks, to grow
without bound, then the program will very quickly run out of memory,
often bringing the operating system to its knees.
This is why we introduced the context limit workaround
(described in Section~\ref{sec:rts_original_scheduling_performance}),
which can prevent a program from crashing,
but which limits the amount of parallel execution.
% The natural way to execute a parallel version of such a loop
% is to keep spawning off the task of performing each iteration
% until all the available cores are busy
%
Our transformation explicitly limits
the number of stacks allocated to recursive calls
to a small multiple of the number of available processors in the system.
This transformation can also be asked
to remove the dependency of a parallel loop iteration
on the parent stack frame from which it was spawned,
allowing the parent frame to be reclaimed
before the completion of the recursive call.
This allows parallel tail recursive computations
to run in constant stack space.
The transformation is applied
after the automatic parallelisation transformation,
so it benefits both manually and automatically parallelised Mercury code.
% Our benchmark results are very encouraging.
% Limiting the number of stacks
% not only permits deep tail recursions to take advantage of multiple cores,
% but it also significantly improves performance.
% For most of our benchmarks, we get near-optimal speedups.
% % The second
% % transformation does not produce a speed improvement; in some cases it
% % causes a slight slow-down. However, this small price is well worth
% % paying to allow parallel tail recursive computations to run in
% % constant stack space.
% % This is the old explaination, which is now covered in Ch3.
% \section{The main problem}
% \label{sec:problem}
%
% As Mercury is a declarative programming language,
% Mercury programs make heavy use of recursion.
% Like the compilers for most declarative languages,
% the Mercury compiler optimises tail recursive procedures
% into code that can run in constant stack space.
% Since this generally makes tail recursive computations
% more efficient than code using other forms of recursion,
% typical Mercury code makes heavy use of tail recursion in particular.
%
% Unfortunately, tail recursive computations are not naturally handled well
% by Mercury's implementation of parallel conjunctions.
% Consider the \mapfoldl{} predicate in Figure~\ref{fig:mapfoldl}.
% This code applies the map predicate \code{M} to each element of an input list,
% and then uses the fold predicate \code{F}
% to accumulate (in a left-to-right order) all the results produced by \code{M}.
% The best parallelisation of \mapfoldl{} executes
% \code{M} and \code{F} in parallel with the recursive call.
% \tr{
% \code{M(H, MappedH), F(MappedH, Acc0, Acc1)} in parallel with
% Although all executions of \code{M} are independent
% and need not wait for anything to begin their computation,
% each call to \code{F} must wait until
% the call to \code{M} generates the value of \code{MappedH}.
% Thus there would be no point in executing \code{M} and \code{F} in parallel:
% \code{F} would immediately suspend until \code{M} had produced its result.
% However, the recursive call to \mapfoldl \emph{can} begin in parallel,
% allowing the next call to \code{M} to run in parallel with this iteration.
% }
% The programmer (or an automatic tool) can make this happen
% in the original sequential version of \mapfoldl
% by replacing the comma before the recursive call
% with the parallel conjunction operator \verb'&'.
% % Example.
% The problem is that the execution of a call to \mapfoldlpar{}
% has bad memory behaviour.
% When a context begins execution of a call to \mapfoldl{},
% it begins by creating a spark for the second conjunct
% (which contains the recursive call),
% and executes the first conjunct (which starts with the call to \code{M}).
% If another Mercury engine is available at that time,
% it will pick up and execute the spark for the recursive call,
% itself creating a spark for another recursive call
% and executing the \emph{next} call to \code{M}.
% This will continue until all Mercury engines are in use
% and the newest spark for a recursive call % to \mapfoldlpar{}
% must wait for an % available
% engine.
% When an engine completes execution of \code{M} and \code{F},
% it posts the value of \code{Acc1} into \code{FutureAcc1}.
% Any computations waiting for \code{Acc1} will then be woken up;
% these will be the calls that wait for \code{Acc0} in the next iteration.
% In this case, the woken code will resume execution
% immediately before the call to \code{F}
% in the recursive invocation of \mapfoldlpar{}.
%
% One might hope that after a spark for the recursive call has been created,
% and once \code{M} and \code{F} had completed execution
% and \code{Acc1} has been signalled,
% the context used to execute the first conjunct could be released.
% Unfortunately, it cannot because this context is the one that was running
% when execution entered the parallel conjunction,
% and therefore this is the context whose stacks
% contain the state of the computation outside the parallel conjunction.
% If we allowed this context to be reused,
% then all this state would be lost.
%
% This means that until the base case of the recursion is reached,
% \emph{every} recursive call must have its own complete execution context.
% Since each context contains two stacks,
% it can occupy a rather large amount of memory,
% so it is not practical to simultaneously preserve an execution context
% for each and every recursive call to a tail-recursive predicate.
% Originally, programs which bumped into this problem
% often ran themselves and the operating system out of memory rather quickly,
% because the default size of every det stack was several megabytes.
% To reduce the scope of the problem,
% we made stacks dynamically expandable,
% which allowed us to reduce their initial size,
% but programs with the problem can still run out of memory,
% it just takes more iterations to do so.
% \label{sec:context_limit}
% Our runtime system prevents such crashes
% by imposing a global limit on the number of contexts
% that can be running or suspended at any point:
% if a context is needed to execute a spark
% and allocating the context would breach this limit,
% then the spark will not be executed.
% Eventually, the context that created the spark will execute it on its
% own stack, but this limits the remainder of the recursive computation
% to use only that context, so parallelism is curtailed at that point.
%
% A much better solution is to swap the order of the conjuncts
% in the parallel conjunction
% so that the conjunct containing the recursive call is executed first.
% This means that we will spawn off the non-recursive conjuncts,
% whose contexts \emph{can} be freed when their execution is complete.
% However, since the Mercury mode system requires that
% the producer of a variable precede all its consumers,
% this is possible only if the conjuncts are independent.
% The approach we have taken in this paper
% is to spawn off the non-recursive conjuncts,
% and continue execution of the recursive call
% without swapping the order of the conjuncts.
% We also directly limit the number of contexts that are used in a loop
% to a small multiple of the number of available CPUs.
% Finally, we can arrange for
% the inputs and outputs of the non-recursive conjuncts
% to be stored outside the stack frame of a tail recursive procedure,
% which allows such procedures to run in fixed stack space
% even when executed in parallel.
% In the next section, we explain all of these improvements.
% % If they are dependent, this solution is not applicable,
% % so for such conjunctions we need a completely different solution.
% % We present one in the next section.
% \peter{Not sure how to cover this:}
% \paul{This is not as important as other issues,
% I am not sure it is worth confusing the reader.}
% zs: compiling with and without loop control both have their own unique
% overheads. There is not a clear advantage either way, so the issue is not
% important enough to mention.
%
% There is at most, only one spark on the spark queue,
% which means that parallel work is not abundant.
% There are likely to be more context switches.
% These problems are secondary to the memory consumption problems.
% However, loop control also fixes them ensuring that loop control has lower
% overhead than parallel conjunctions.
\section{The loop control transformation}
\label{sec:lc_transformation}
\status{This section is ready for someone to check.}
\begin{figure}[tb]
\begin{verbatim}
map_foldl_par(M, F, L, FutureAcc0, Acc) :-
lc_create_loop_control(LC),
map_foldl_par_lc(LC, M, F, L, FutureAcc0, Acc).
map_foldl_par_lc(LC, M, F, L, FutureAcc0, Acc) :-
(
L = [],
% The base case.
wait_future(FutureAcc0, Acc0),
Acc = Acc0,
lc_finish(LC)
;
L = [H | T],
new_future(FutureAcc1),
lc_wait_free_slot(LC, LCslot),
lc_spawn_off(LC, LCslot, (
M(H, MappedH),
wait_future(FutureAcc0, Acc0),
F(MappedH, Acc0, Acc1),
signal_future(FutureAcc1, Acc1),
lc_join_and_terminate(LC, LCslot)
)),
map_foldl_par_lc(LC, M, F, T,
FutureAcc1, Acc)
).
\end{verbatim}
%\vspace{2mm}
\caption{\mapfoldlpar after the loop control transformation}
\label{fig:map_foldl_transformed}
%\vspace{-1\baselineskip}
\end{figure}
The main aim of loop control is to set an upper bound
on the number of contexts that a loop may use,
regardless of how many iterations of the loop may be executed,
without limiting the amount of parallelism available.
The loops we are concerned about
are procedures that we call right recursive:
procedures in which the recursive execution path
ends in a parallel conjunction
whose last conjunct contains the recursive call.
A right recursive procedure may be tail recursive, or it may not be:
the recursive call could be followed by other code
either within the last conjunct, or after the whole parallel conjunction.
Programmers have long tended to write loops whose last call is recursive
in order to benefit from tail recursion.
\paul{My recursion type analysis does not yet separate right and left
recursion,
so I cannot actually say how common it is.}
Therefore,
right recursion is very common;
most parallel conjunctions in recursive procedures are right recursive.
To guarantee the imposition of an upper bound
on the number of contexts created during one of these loops,
we associate with each loop a data structure
that has a fixed number of slots,
and require each iteration of the loop that would spawn off a goal
to reserve a slot for the context of each spawned-off computation.
This slot is marked as in-use until that spawned-off computation finishes,
at which time it becomes available for use by another iteration.
This scheme requires us to use two separate predicates:
the first sets up the data structure
(which we call the \emph{loop control} structure)
and the second actually performs the loop.
The rest of the program knows only about the first predicate;
the second predicate is only ever called from the first predicate
and from itself.
Figure~\ref{fig:map_foldl_transformed} shows what these predicates look like.
In Section~\ref{sec:lc_structs},
we describe the loop control structure and the operations on it;
in Section~\ref{sec:lc_trans},
we give the algorithm that does the transformation;
in Section~\ref{sec:lc_tailrec},
we discuss its interaction with tail recursion optimisation.
\subsection{Loop control structures}
\label{sec:lc_structs}
\begin{figure}
\begin{verbatim}
typedef struct MR_LoopControl_Struct MR_LoopControl;
typedef struct MR_LoopControlSlot_Struct MR_LoopControlSlot;
struct MR_LoopControlSlot_Struct
{
MR_Context *MR_lcs_context;
MR_bool MR_lcs_is_free;
};
struct MR_LoopControl_Struct
{
volatile MR_Integer MR_lc_outstanding_workers;
MR_Context* volatile MR_lc_master_context;
volatile MR_Lock MR_lc_master_context_lock;
volatile MR_bool MR_lc_finished;
/*
** MR_lc_slots MUST be the last field, since in practice, we treat
** the array as having as many slots as we need, adding the size of
** all the elements except the first to sizeof(MR_LoopControl) when
** we allocate memory for the structure.
*/
unsigned MR_lc_num_slots;
MR_LoopControlSlot MR_lc_slots[1];
};
\end{verbatim}
\caption{Loop control structure}
\label{fig:loop_control_structure}
\end{figure}
The loop control structure,
shown in Figure~\ref{fig:loop_control_structure},
contains the following fields:
\begin{description}
\item[\code{MR\_lc\_slots}]
is an array of slots, each of which contains a boolean and a pointer.
The boolean says whether the slot is free,
and if it is not,
the pointer points to the context that is currently occupying it.
When the occupying context finishes,
the slot is marked as free again,
but the pointer remains in the slot
to make it easier (and faster) for the next computation that uses that slot
to find a free context to reuse.
Therefore we cannot encode the boolean in the pointer being \NULL or non-\NULL.
Although this is the most significant field in the structure, it is last so
that the array can be stored inline with the rest of the structure,
avoiding an extra memory dereference.
% (The description of the \code{lc\_wait\_free\_slot(LC)} operation below
% will show why cannot we encode the boolean
% in the pointer being null/non-null.)
\item[\code{MR\_lc\_num\_slots}]
stores the number of slots in the array.
\item[\code{MR\_lc\_outstanding\_workers}]
is the count of the number of slots that are currently in use.
\item[\code{MR\_lc\_master\_context}]
is a possibly null pointer to the \emph{master} context,
the context that created this structure,
and the context that will spawn off all of the iterations.
This slot will point to the master context whenever it is sleeping,
and will be \NULL at all other times.
\item[\code{MR\_lc\_master\_context\_lock}]
is a mutex that protects access to \code{MR\_lc\_master\_context}.
The other fields are protected using atomic instructions;
we will describe them when we show the code for the loop control procedures.
% \zoltan{Should we explain how accesses to some fields
% can dispense with the mutex?}
% \paul{We decided not to explain this}
\item[\code{MR\_lc\_finished}]
is a boolean flag that says whether the loop has finished.
It is initialised to false, and is set to true
as the first step of the \lcfinish operation.
%\item[\code{MR\_lc\_free\_slot\_hint}]
%contains an index into the array of free slots.
%It indicates which slot may be free;
%we use it to speed up the search for a free slot in the average case.
%\zoltan{we need an argument for WHY this is a speedup,
%but that argument does not belong here,
%since it depends on the behaviour of operations we have not described yet.
%Alternatively, we can completely avoid mentioning the hint field.}
%\paul{I think it is easier to not mention it, it is optimal in some cases and
%neither optimal nor pessimal in other cases.}
\end{description}
\noindent
The finished flag
is not strictly needed for the correctness of the following operations,
but it can help the loop control code cleanup at the end of a loop more
quickly.
In the following description of the primitive operations
on the loop control structure,
\LC is a reference to the whole of a loop control structure,
while \LCS is an index into the array of slots stored within \LC.
\begin{description}
\item[\code{LC = lc\_create\_loop\_control()}]
This operation creates a new loop control structure,
and initialises its fields.
The number of slots in the array in the structure
will be a small multiple of the number of cores in the system.
The multiplier is configurable
by setting an environment variable when the program is run.
%\begin{algorithm}[tbp]
%\begin{algorithmic}
%\Procedure{MR\_lc\_wait\_free\_slot}{$lc$, $retry\_label$}
% %unsigned hint, offset, i;
%
% \If{$lc.MR\_lc\_outstanding\_workers = lc.MR\_lc\_num\_slots$}
% \State MR\_aquire\_lock($lc.MR\_lc\_master\_context\_lock$)
% \If{$lc.MR\_lc\_outstanding\_workers = lc.MR\_lc\_num\_slots$}
% %MR\_Context *ctxt;
% \State /* Only commit to sleeping while holding the lock. */
% %so retest the outstanding worker count.
% \State $ctxt \gets$ MR\_ENGINE($MR\_eng\_this\_context$)
% \State MR\_save\_context($ctxt$)
% \State $ctxt.MR\_ctxt\_resume \gets retry\_label$
% %\State $ctxt.MR\_ctxt\_resume\_owner\_engine \gets$ MR\_ENGINE($MR_eng_id$)
% \State $lc.MR\_lc\_master\_context \gets ctxt$
% %MR_CPU_SFENCE;
% \State MR\_release\_lock($lc.MR\_lc\_master\_context\_lock$)
% \State MR\_ENGINE($MR\_eng\_this\_context$) $\gets$ \NULL
% \State MR\_idle()
% \EndIf
% \State MR\_release\_lock($lc.MR\_lc\_master\_context\_lock$)
% \EndIf
%
% \State $hint \gets lc.MR\_lc\_free\_slot\_hint$
%
% \For{$offset \gets 0$ to $lc.MR\_lc\_num\_slots$}
% \State $i \gets (hint + offset) \bmod lc.MR\_lc\_num\_slots$
% \If{$lc.MR\_lc\_slots[i].MR\_lcs\_is\_free$}
% \State $lc.MR\_lc\_slots[i].MR\_lcs\_is\_free \gets false$
% \State $lc.MR\_lc\_free\_slot\_hint \gets
% (i + 1) \bmod lc.MR\_lc\_num\_slots$
% \State MR\_atomic\_inc\_int($lc.MR\_lc\_outstanding\_workers$)
% \State $lcs\_idx \gets i$
% \State \Break
% \EndIf
% \EndFor
%
% \If{$lc.MR\_lc\_slots[i].MR\_lcs\_context = $\NULL}
% %\Comment Allocate a new context.
% \State $lc.MR\_lc\_slots[i].MR\_lcs\_context \gets$
% MR\_create\_context()
% %\State
% % $lc.MR\_lc\_slots[i].MR\_lcs\_context.MR\_ctxt\_thread\_local\_mutables
% % \gets MR\_THREAD\_LOCAL\_MUTABLES$
% \EndIf
%
% \State reset\_context\_stack\_ptr($lc.MR\_lc\_slots[i].MR\_lcs\_context$)
%
% \State \Return $lcs\_idx$
%\EndProcedure
%\end{algorithmic}
%\caption{\lcwaitfreeslot}
%\label{alg:lc_free_slot}
%\end{algorithm}
\item[\code{LCslot = lc\_wait\_free\_slot(LC)}]
% This operation tests \linebreak[3] whether \LC{} has any free slots.
% This hyphenation improves this paragraph, I have swapped one evil for
% another.
% This operation tests whe\-ther \LC{} has any free slots.
This operation tests whether \LC{} has any free slots.
If it does not, the operation suspends until a slot becomes available.
When some slots are available, either immediately or after a wait,
the operation chooses one of the free slots, marks it in use,
fills in its context pointer and returns its index.
It can get the context to point to
from the last previous user of the slot,
from a global list of free contexts
(in both cases it gets contexts which have been used previously
by computations that have terminated earlier),
or by allocating a new context
(which typically happens only soon after startup).
% I have edited this so that it does not refer to sparks, since they are not used
% with loop control.
\item[\code{lc\_spawn\_off(LC, LCslot, CodeLabel)}]
This operation sets up the context in the loop control slot,
and then puts it on the global runqueue,
where any engine looking for work can find it.
Setup of the context consists of initialising the context's parent stack
pointer to point to its master's stack frame,
and the context's resume instruction pointer to the value of \code{CodeLabel}.
% \paul{Possibly:
% Setup consists of creating a stack frame on the context's stack and copying
% the relevant values from the master's stack frame onto the worker-context's.
% Also, the context's resume instruction pointer will be initialised to
% the value of \code{CodeLabel}.
% }
\item[\code{lc\_join\_and\_terminate(LC, LCslot)}]
This operation marks the slot named by \LCS{} in \LC{} as available again.
It then terminates the context executing it,
allowing the engine that was running it to look for other work.
\item[\code{lc\_finish(LC)}]
This operation is executed by the master context when we know
that this loop will not spawn off any more work packages.
It suspends its executing context
until all the slots in \LC{} become free.
This will happen only when all the goals spawned off by the loop
have terminated.
This is necessary to ensure that
all variables produced by the recursive call
that are \emph{not} signalled via futures
have in fact had values generated for them.
A variable generated by a parallel conjunct
that is consumed by a later parallel conjunct will be signalled via a future,
but if the variable is consumed only by code after the parallel conjunction,
then it is made available by writing its value directly in its stack
slot.
Therefore such variables can exist
only if the original predicate had code after the parallel conjunction;
for example, map over lists
must perform a construction after the recursive call.
This barrier is the only barrier in the loop and it is executed just once;
in comparison, the normal parallel conjunction execution mechanism
executes one barrier in each iteration of the loop.
\end{description}
\zoltan{Should we give the pseudo-code of the operations?}
\paul{XXX: I have given the pseudo-code for one of these, if I have time I'll add
pseudo-code for the others.}
\noindent
See Figure~\ref{fig:map_foldl_transformed}
for an example of how we use these operations.
Note in particular that in this transformed version of \mapfoldl{},
the spawned-off computation contains the calls to \var{M} and \var{F},
with the main thread of execution making the recursive call.
This is the first step in preserving tail recursion optimisation.
% Some of these operations also perform some scheduling;
% we will discuss that later.
\picfigure{lc_context_usage}{Loop control context usage}
Figure~\ref{fig:lc_context_usage} shows a visual representation of context
usage when using loop control;
it should be compared with Figure~\ref{fig:linear_context_usage2}.
As before, this is how contexts are likely to be used on a four processor
system
when using a multiplier of two so that eight slots are used;
minor differences in the execution times of each task and similar variables
will mean that no execution will look as regular as in the figures.
In Figure~\ref{fig:lc_context_usage},
we can see that a total of eight contexts are created and four are in use at
a time.
When the loop begins,
the master thread performs a few iterations, executing \lcwaitfreeslot and
\lcspawnoff.
This creates all the contexts and adds them to the runqueue,
but only the first four contexts can be executed as there are only four
processors.
Once those contexts finish their work,
they execute \lcjoinandterminate.
Each call to \lcjoinandterminate marks the relevant slot in the loop control
structure as free,
allowing the master context to spawn off more work using the free slot.
Meanwhile, the other four contexts are now able to execute their work.
This continues until the loop is finished, at which point \lcfinish releases
all the contexts.
\subsection{The loop control transformation}
\label{sec:lc_trans}
Our algorithm for transforming procedures to use loop control
is shown in
Algorithms~\ref{alg:transform_alg},~\ref{alg:reccases_alg} and~\ref{alg:basecases_alg}.
\begin{algorithm}[tbp]
\begin{algorithmic}
\Procedure{loop\_control\_transform}{$OrigProc$}
\State $OrigGoal \gets$ body($OrigProc$)
\State $RecParConjs \gets$ set of parallel conjunctions
in $OrigGoal$ that contain recursive calls
\BigIf
\algcondition{1}{$OrigProc$ is directly but not mutually recursive
(HO calls are assumed not to create recursion), and}
\algcondition{2}{$OrigGoal$ has at most one recursive call
on all possible execution paths, and}
\algcondition{3}{$OrigGoal$ has determinism \ddet, and}
\algcondition{4}{no recursive call is within a disjunction,
a scope that changes the determinism of a goal,
a negation, or the condition of an if-then-else, and}
\algcondition{5}{no member of $RecParConjs$ is nested within
another parallel conjunction, and}
\algcondition{6}{every recursive call is inside
the last conjunct of a member of $RecParConjs$, and}
\algcondition{7}{every execution path through
one of these last conjuncts
makes exactly one recursive call}
\BigIfThen
\State $LC \gets$ create\_new\_variable()
\State $LCGoal \gets$ the call
`lc\_create\_loop\_control($LC$)'
\State $LoopProcName \gets$ a new unique predicate name
\State $OrigArgs \gets$ arg\_list($OrigProc$)
\State $LoopArgs \gets$ [$LC$] ++ $OrigArgs$
\State $CallLoopGoal \gets$ the call
`LoopProcName($LoopArgs$)'
\State $NewProcBody \gets$ the conjunction `$LCGoal,~CallLoopGoal$'
\State $NewProc \gets OrigProc$ with its body replaced
by $NewProcBody$
\State \parbox{0.98\textwidth}{
\begin{tabbing}
$LoopGoal \gets$ create\_loop\_goal(\=$OrigGoal$,
$OrigProcName$, $LoopProcName$, \\
\>$RecParConjs$, $LC$)
\end{tabbing}}
\State $LoopProc \gets \text{new\_procedure}\left(
\begin{tabular}{l}
$LoopProcName$\code{(}$LoopArgs$\code{) :-} \\
\code{~~~~}$LoopGoal$\code{.}
\end{tabular} \right)$
\State $NewProcs \gets$ [$NewProc,~LoopProc$]
\BigIfElse
\State $NewProcs \gets$ [$OrigProc$]
\EndBigIfElse
\State \Return $NewProcs$
\EndProcedure
\end{algorithmic}
%\vspace{2mm}
\caption{The top level of the transformation algorithm}
\label{alg:transform_alg}
%\vspace{-1\baselineskip}
\end{algorithm}
\begin{algorithm}[tbp]
\begin{algorithmic}
\Procedure{create\_loop\_goal}{$OrigGoal$, $OrigProcName$, $LoopProcName$,
$RecParConjs$, $LC$}
\State $LoopGoal \gets OrigGoal$
\For{$RecParConj \in RecParConjs$}
\State $RecParConj$ has the form `$Conjunct_1~\&~\ldots~\&~Conjunct_n$'
for some $n$
\For{$i \gets 1$ to $n-1$}
\Comment This does not visit the last goal in $RecParConj$
\State $LCSlot_i \gets$ create\_new\_variable()
\State $WaitGoal_i \gets$ the call
`lc\_wait\_free\_slot($LC$, $LCSlot_i$)'
\State $JoinGoal_i \gets$ the call
`lc\_join\_and\_terminate($LC$, $LCSlot_i$)'
\State \parbox{0.7\textwidth}{
\begin{tabbing}
$SpawnGoal_i \gets$ \=a goal that spawns off the sequential
conjunction \\
\>`$Conjunct_i, JoinGoal_i$' as a work package
\end{tabbing}
}
\State $Conjunct_i' \gets$ the sequential conjunction
`$WaitGoal_i, SpawnGoal_i$'
\EndFor
\State $Conjunct_n' \gets Conjunct_n$
\For{each recursive call $RecCall$ in $Conjunct_n'$}:
\State $RecCall$ has the form `$OrigProcName(Args)$'
\State $RecCall' \gets$ the call
`$LoopProcName$([$LC$] ++ $Args$)'
\State \textbf{replace} $RecCall$ with $RecCall'$ in $Conjunct_n'$
\EndFor
\State \parbox{0.7\textwidth}{
\begin{tabbing}
$Replacement \gets$ \=the flattened form
of the sequential conjunction \\
\>`$Conjunct_1',~\ldots,~Conjunct_n'$'
\end{tabbing}}
\State \textbf{replace} $RecParConj$ in $LoopGoal$ with $Replacement$
\EndFor
\State $LoopGoal' \gets$ put\_barriers\_in\_base\_cases($LoopGoal$,
$RecParConjs$, $LoopProcName$, $LC$)
\State \Return $LoopGoal'$
\EndProcedure
\end{algorithmic}
%\vspace{2mm}
\caption{Algorithm for transforming the recursive cases}
\label{alg:reccases_alg}
%\vspace{-1\baselineskip}
\end{algorithm}
Algorithm~\ref{alg:transform_alg} shows the top level of the algorithm,
which is mainly concerned with testing
whether the loop control transformation is applicable to a given procedure,
and creating the interface procedure if it is.
We impose conditions (1) and (2) because we need to ensure
that every loop we start for \var{OrigProc} is finished exactly once,
by the call to \lcfinish we insert into its base cases.
If \var{OrigProc} is mutually recursive with some other procedure,
then the recursion may terminate in a base case of the other procedure,
which our algorithm does not transform.
Additionally, if \var{OrigProc} has some execution path on which it calls
itself twice,
then the second call may continue executing loop iterations
after a base case reached through the first call has finished the loop.
We impose conditions (3) and (4) because the Mercury implementation
does not support the parallel execution of code that is not deterministic.
We do not want a recursive call to be called twice because
some code between the entry point of \var{OrigProc} and the recursive call
succeeded twice,
and we do not want a recursive call to be backtracked into because
some code between the recursive call and the exit point of \var{OrigProc}
has failed.
These conditions prevent both of those situations.
We impose condition (5) because we do not want another instance of loop control,
or an instance of the normal parallel conjunction execution mechanism,
to interfere with this instance of loop control.
We impose condition (6) for two reasons.
First, the structure of our transformation requires right-recursive code:
we could not terminate the loop in base case code
if the call that led to that code
was followed by any part of an earlier loop iteration.
Second, allowing recursion to sometimes occur
outside the parallel conjunctions we are trying to optimise
would unnecessarily complicate the algorithm.
(We do believe that it should be possible to extend our algorithm
to handle recursive calls made outside of parallel conjunctions.)
% We impose condition (7) to simplify the construction of a correctness argument
% in favor of the proposition that the two algorithms
% that transform the recursive calls and the bases cases respectively
% (which are shown in Figures \ref{fig:reccases_alg}
% and \ref{fig:basecases_alg})
% do not interfere in each other's operation.
We impose condition (7) to ensure that
our algorithm for transforming base cases
(Algorithm~\ref{alg:basecases_alg})
does not have to process goals that have already been processed
by our algorithm for transforming recursive calls
(Algorithm~\ref{alg:reccases_alg}).
If the transformation is applicable, we apply it.
The transformed original procedure has only one purpose:
to initialise the loop control structure.
Once that is done, it passes a reference to that structure to \var{LoopProc},
the procedure that does the actual work.
The argument list of \var{LoopProc}
is the argument list of \var{OrigProc}
plus the \LC variable that holds the reference to the loop control structure.
The code of \var{LoopProc} is derived from the code of \var{OrigProc}.
Some execution paths in this code include a recursive call; some do not.
The execution paths that contain a recursive call
are transformed by Algorithm~\ref{alg:reccases_alg};
the execution paths that do not
are transformed by Algorithm~\ref{alg:basecases_alg}.
We start with Algorithm~\ref{alg:reccases_alg}.
Due to condition (6),
every recursive call in \var{OrigGoal}
will be inside the last conjunct of a parallel conjunction,
and the main task of \createloopgoal
is to iterate over and transform these parallel conjunctions.
(It is possible that some parallel conjunctions do not contain recursive calls;
\createloopgoal will leave these untouched.)
% The twin aims of our transformation are to
% (a) spawn off each parallel conjunct before the final recursive conjunct,
% in order to generate work for other cores to do, and
% (b) limit the number of work packages spawned off by the loop at any one time,
% in order to limit memory consumption.
The main aim of the loop control transformation is
to limit the number of work packages spawned off by the loop at any one time,
in order to limit memory consumption.
The goals we want to spawn off
%as work packages that other cores can pick up and execute
are all the conjuncts before the final recursive conjunct.
(Without loop control, we would spawn off all the \emph{later} conjuncts.)
The first half of the main loop in \createloopgoal
therefore generates code that creates and makes available each work package
only after it obtains a slot for it in the loop control structure,
waiting for a slot to become available if necessary.
We make the spawned-off computation free that slot when it finishes.
To implement the spawning off process,
we extended the internal representation of Mercury goals
with a new kind of scope.
The only one shown in the abstract syntax in
Figure~\ref{fig:abstractsyntax}
%Section~\ref{sec:backgnd_mercury}
was the existential quantification scope
($some~[X_1,\ldots,X_n]~G$),
but the Mercury implementation had several other kinds of scopes already,
though none of those are relevant for this dissertation.
We call the new kind of scope the spawn-off scope,
and we make $SpawnGoal_i$ be a scope goal of this kind.
When the code generator processes such scopes,
it
\begin{itemize}
\item
generates code for the goal inside the scope
(which will end with a call to \lcjoinandterminate),
\item
allocates a new label,
\item
puts the new label in front of that code,
\item
puts this labelled code aside so that
later it can be added to the end of the current procedure's code, and
\item
inserts into the instruction stream a call to \lcspawnoff
that specifies that the spawned-off computation should start execution
at the label of the set-aside code.
The other arguments of \lcspawnoff come from the scope kind.
\end{itemize}
\noindent
Since we allocate a loop slot \var{LCSlot}
just before we spawn off this computation,
waiting for a slot to become available if needed,
and free the slot once this computation has finished executing,
the number of computations that have been spawned off by this loop
and which have not yet been terminated
cannot exceed the number of slots in the loop control structure.
\begin{algorithm}[tbp]
\begin{algorithmic}
\Procedure{put\_barriers\_in\_base\_cases}{$LoopGoal$,
$RecParConjs$, $LoopProcName$, $LC$}
\If{$LoopGoal$ is a parallel conjunction in $RecParConjs$}
\Comment{case 1}
\State $LoopGoal' \gets LoopGoal$
\ElsIf{there is no call to $LoopProcName$ in $LoopGoal$}
\Comment{case 2}
\State $FinishGoal \gets$ the call `lc\_finish($LC$)'
\State $LoopGoal' \gets$ the sequential conjunction
`$LoopGoal,~FinishGoal$'
\Else
\Comment{case 3}
\Switch{goal\_type($LoopGoal$)}
\Case{`ite($C$, $T$, $E$)'}
\State $T' \gets$ put\_barriers\_in\_base\_cases($T$,
$RecParConjs$, $LoopProcName$, $LC$)
\State $E' \gets$ put\_barriers\_in\_base\_cases($E$,
$RecParConjs$, $LoopProcName$, $LC$)
\State $LoopGoal' \gets$ `ite($C$, $T'$, $E'$)'
\EndCase
\Case{`switch($V$, [$Case_1$, \ldots, $Case_N$])'}
\For{$i \gets 1$ to $N$}
\State $Case_i \gets$ `case($FunctionSymbol_i$, $Goal_i$)'
\State \parbox{0.7\textwidth}{
\begin{tabbing}
$Goal_i' \gets$ \=put\_barriers\_in\_base\_cases($Goal_i$,
$RecParConjs$, $LoopProcName$,\\
\>$LC$)
\end{tabbing}}
\State $Case_i' \gets$ `case($FunctionSymbol_i$, $Goal_i'$)'
\EndFor
\State $LoopGoal' \gets$ `switch($V$, [$Case_1'$, \ldots, $Case_N'$])'
\EndCase
\Case{`$Conj_1$, \ldots, $Conj_N$'}
\Comment Sequential conjunction
\State $i \gets 1$
\While{$Conj_i$ does not contain a call to $LoopProcName$}
\State $i \gets i + 1$
\EndWhile
\State $Conj_i' \gets$ put\_barriers\_in\_base\_cases($Conj_i$,
$RecParConjs$, $LoopProcName$, $LC$)
\State $LoopGoal' \gets LoopGoal$ with
$Conj_i$ replaced with $Conj_i'$
\EndCase
\Case{`some($Vars$, $SubGoal$)'}
\Comment Existential quantification
\State \parbox{0.7\textwidth}{
\begin{tabbing}
$SubGoal' \gets$ \=put\_barriers\_in\_base\_cases($SubGoal$,
$RecParConjs$,\\
\>$LoopProcName$, $LC$)
\end{tabbing}}
\State $LoopGoal' \gets$ `some($Vars$, $SubGoal'$)'
\EndCase
\Case{a call `$ProcName$($Args$)'}
\If{$ProcName = OrigProcName$}
\State $LoopGoal' \gets$ the call `$LoopProcName$([$LC$] ++ $Args$)'
\Else
\State $LoopGoal' \gets LoopGoal$
\EndIf
\EndCase
\EndSwitch
\EndIf
\State \Return $LoopGoal'$
\EndProcedure
\end{algorithmic}
%\vspace{2mm}
\caption{Algorithm for transforming the base cases}
\label{alg:basecases_alg}