# This BibTeX file is provided as an example/test case for the bb4dw DokuWiki plugin.
# This file intentionally uses a variety of BibTeX formatting conventions and styles!

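# Tech report entry: all-caps field names plus non-standard fields (EMAIL, FTP, CONTENTS, TOPICS),
# some deliberately left empty.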
@TECHREPORT{Cann89,
  AUTHOR      = {D.C. Cann},
  EMAIL       = {},
  TITLE       = {{C}ompilation {T}echniques for {H}igh {P}erformance {A}pplicative
                 {C}omputation},
  INSTITUTION = {Lawrence Livermore National Laboratory},
  ADDRESS     = {LLNL, Livermore, California},
  TYPE        = {},
  NUMBER      = {CS-89-108},
  YEAR        = 1989,
  NOTE        = {},
  FTP         = {},
  KEYWORDS    = {},
  CONTENTS    = {A comprehensive introduction to SISAL's internal structure.
                 Includes a detailed description of the enhanced "update in place"
                 mechanism used by SISAL.},
  TOPICS      = {Sisal}
}
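# Entire entry on a single line, with empty volume/number fields and a trailing comma
# before the closing brace.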
@INPROCEEDINGS{IPDPS2018,  author={T. {Macht} and C. {Grelck}},  booktitle={2019 IEEE International Parallel and Distributed Processing Symposium (IPDPS)},   title={SAC Goes Cluster: Fully Implicit Distributed Computing},   year={2019},  volume={},  number={},  pages={996-1006},}

@ARTICLE{ ScholzJFP03,
  AUTHOR     = {Sven-Bodo Scholz},
  TITLE      = {{Single} {Assignment} {C} ---
                Efficient Support for High-Level Array Operations in a Functional Setting},
  JOURNAL    = {Journal of Functional Programming},
  YEAR       = {2003},
  VOLUME     = {13},
  NUMBER     = {6},
  PAGES      = {1005--1059},
  NOTE       = {},
  CONTENTS   = {Canonical reference for SAC},
  DOI        = {10.1017/S0956796802004458},
  ABSTRACT   = {This paper presents a novel approach for integrating arrays with access time O(1) into functional languages. It introduces n-dimensional arrays combined with a type system that supports hierarchies of array types with varying shape information as well as a shape-invariant form of array comprehension called with-loop. Together, these constructs allow for a programming style similar to that of array programming languages such as Apl. We use Single Assignment C (SaC), a functional C-variant aimed at numerical applications that is based on the proposed design, to demonstrate that programs written in that style can be compiled to code whose runtime performance is competitive with that of hand-optimized Fortran programs. However, essential prerequisites for such performance figures are a shape inference system integrated in the type system as well as several high-level optimizations. Most notably of these is With Loop Folding, an optimization technique for eliminating intermediate arrays.},
  CATEGORY   = {Journal},
  TOPICS     = {SAC}
}

@ARTICLE{ GrelSchoIJPP06,
  AUTHOR    = {Clemens Grelck and Sven-Bodo Scholz},
  TITLE     = {{SAC}: A Functional Array Language for Efficient Multithreaded Execution},
  JOURNAL   = {International Journal of Parallel Programming},
  YEAR      = 2006,
  VOLUME    = {34},
  NUMBER    = {4},
  PAGES     = {383--427},
  NOTE      = {},
  CONTENTS  = {[ISSN: 0885-7458 (Paper) 1573-7640 (Online)]},
  DOI       = {10.1007/s10766-006-0018-x},
  ABSTRACT  = {We give an in-depth introduction to the design of our functional array programming language SaC, the main aspects of its compilation into host machine code, and its parallelisation based on multi-threading. The language design of SaC aims at combining high-level, compositional array programming with fully automatic resource management for highly productive code development and maintenance. We outline the compilation process that maps SaC programs to computing machinery. Here, our focus is on optimisation techniques that aim at restructuring entire applications from nested compositions of general fine-grained operations into specialised coarse-grained operations. We present our implicit parallelisation technology for shared memory architectures based on multi-threading and discuss further optimisation opportunities on this level of code generation. Both optimisation and parallelisation rigorously exploit the absence of side-effects and the explicit data flow characteristic of a functional setting.},
  TOPICS    = {SAC},
  CATEGORY  = {Journal},
  AFFIL     = {ctca}
}

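# The @misc entries below use double-quoted field values and \url{...} inside howpublished.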
@misc{Sakharnykh2017,
    author = "{Nikolay Sakharnykh}",
    title = "{Maximizing Unified Memory Performance in CUDA}",
    year = "2017",
    howpublished = "\url{https://devblogs.nvidia.com/maximizing-unified-memory-performance-cuda/}",
    note = "[Online; 29-May-2019]"
}
@misc{cuda10.1doc,
    author = "{NVIDIA Corporation}",
    title = "{CUDA Toolkit Documentation v10.1.168}",
    year = "2019",
    howpublished = "\url{https://web.archive.org/web/20190523173815/https://docs.nvidia.com/cuda/archive/10.1/}",
    note = "[Wayback Machine; 02-Nov-2019]"
}
@misc{cudaConcurrency2011,
    author = "{Steve Rennich}",
    title = "{CUDA C/C++ Streams and Concurrency}",
    year = "2011",
    howpublished = "\url{http://on-demand.gputechconf.com/gtc-express/2011/presentations/StreamsAndConcurrencyWebinar.pdf}",
    note = "[Online; 03-Nov-2019]"
}
@misc{cudaUnifiedMem2018,
    author = "{Nikolay Sakharnykh}",
    Title = "{Everything You Need To Know About Unified Memory}",
    year = "2018",
    howpublished = "\url{http://on-demand.gputechconf.com/gtc/2018/presentation/s8430-everything-you-need-to-know-about-unified-memory.pdf}",
    note = "[Online; 03-Nov-2019]"
}
@article{HARTMANN2019304,
    title = "GPUart - An application-based limited preemptive GPU real-time scheduler for embedded systems",
    journal = "Journal of Systems Architecture",
    volume = "97",
    pages = "304--319",
    year = "2019",
    issn = "1383-7621",
    doi = "10.1016/j.sysarc.2018.10.005",
    author = "Christoph Hartmann and Ulrich Margull",
    keywords = "Real-time scheduling, Limited preemption, Graphics processing unit (GPU), GPU resource management, Embedded systems, Automotive",
    abstract = "Emerging technologies like autonomous driving entail computational intense software solutions. More and more companies accelerate their embedded applications by General Purpose Computing on the Graphics Processing Unit (GPGPU), in order to overcome those computational demands. Unfortunately, Graphics Processing Units (GPUs) severely lack real-time capability, for example controllable preemption support, which limits their applicability in the embedded domain. We therefore present GPUart, a framework for GPU real-time scheduling. GPUart focuses on embedded systems and requires neither hardware nor driver stack extensions. We propose a software-only approach for preemption, based on the fixed preemption point strategy. In contrast to prior work, GPUart enables preemption inside a thread block by adding fixed preemption points. We further propose a portable high-level resource management concept to enable gang scheduling on GPUs. GPUart can schedule GPU workload either under the Gang-Earliest Deadline First (EDF) or Gang-Fixed Task Priority (FTP) policy. A case-study on Nvidia Tegra X1, using real-world engine management applications from Audi AG and Continental Automotive GmbH, shows that only up to 0.28% additional global memory is required to enable interruptible thread blocks. GPUart reduces the worst observed response times by a factor of up to 221, leading to response times without deadline misses."
}
@article{grelck_2005,
    author={Grelck, Clemens},
    title={Shared memory multiprocessor support for functional array processing in SAC},
    volume={15},
    doi={10.1017/S0956796805005538},
    number={3},
    journal={Journal of Functional Programming},
    publisher={Cambridge University Press},
    year={2005},
    pages={353–401}
}
@InProceedings{ScholzIFL1997,
  author     = {Sven-Bodo Scholz},
  title      = {With-loop-folding in {SAC} --- Condensing Consecutive Array Operations},
  booktitle  = {Implementation of Functional Languages, 9th International Workshop (IFL'97), St. Andrews, UK, Selected Papers},
  year       = {1998},
  editor     = {Chris Clack and Tony Davie and Kevin Hammond},
  volume     = {1467},
  series     = {Lecture Notes in Computer Science},
  pages      = {72--92},
  publisher  = {Springer},
  abstract   = {This paper introduces a new compiler optimization called With-loop-folding. It is based on a special loop construct, the with-loop, which in the functional language SAC (for Single Assignment C) serves as a versatile vehicle to describe array operations on an elementwise basis. A general mechanism for combining two of these With-loops into a single loop construct is presented. This mechanism constitutes a powerful tool when it comes to generate efficiently executable code from high-level array specifications. By means of a few examples it is shown that even complex nestings of array operations similar to those available in Apl can be transformed into single loop operations which are similar to hand-optimized With-loop specifications. As a consequence, the way a complex array operation is combined from primitive array operations does not affect the runtime performance of the compiled code, i.e., the programmer is liberated from the burden to take performance considerations into account when specifying complex array operations.},
  category   = {core,design,opt},
  doi        = {10.1007/BFb0055425},
  isbn       = {978-3-540-64849-9},
  pubaddress = {Berlin, Heidelberg, Germany},
  topics     = {SAC,Avoiding Temporaries,Implementation of Arrays},
  url        = {wlf-st-andrews-97.pdf},
}
@inproceedings{jingGPU2011,
  author     = {Jing Guo and Jeyarajan Thiyagalingam and Sven-Bodo Scholz},
  title      = {Breaking the {GPU} Programming Barrier with the Auto-parallelising {SAC} Compiler},
  booktitle  = {6th Workshop on Declarative Aspects of Multicore Programming (DAMP'11), Austin, USA},
  year       = {2011},
  pages      = {15--24},
  publisher  = {ACM Press},
  doi        = {10.1145/1926354.1926359},
}
@inproceedings{jingGPU2009,
  author    = {Jing Guo and
               Jeyarajan Thiyagalingam and
               Sven{-}Bodo Scholz},
  editor    = {Zolt{\'{a}}n Horv{\'{a}}th and
               Vikt{\'{o}}ria Zs{\'{o}}k and
               Peter Achten and
               Pieter W. M. Koopman},
  title     = {Towards Compiling {SAC} to {CUDA}},
  booktitle = {Proceedings of the Tenth Symposium on Trends in Functional Programming,
               {TFP} 2009, Kom{\'{a}}rno, Slovakia, June 2-4, 2009},
  series    = {Trends in Functional Programming},
  volume    = {10},
  pages     = {33--48},
  publisher = {Intellect},
  year      = {2009},
  timestamp = {Tue, 04 Jun 2013 08:01:28 +0200},
  biburl    = {https://dblp.org/rec/conf/sfp/GuoTS09.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}
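# Entry with TeX-escaped author names (\ss{}, \v{S}) and fields starting in column one.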
@inproceedings{emr2018,
author = {Vie\ss{}mann, Hans-Nikolai and \v{S}inkarovs, Artjoms and Scholz, Sven-Bodo},
title = {Extended Memory Reuse: An Optimisation for Reducing Memory Allocations},
year = {2018},
isbn = {9781450371438},
publisher = {ACM},
address = {New York, NY, USA},
doi = {10.1145/3310232.3310242},
booktitle = {Proceedings of the 30th Symposium on Implementation and Application of Functional Languages},
pages = {107–118},
numpages = {12},
keywords = {memory management, compiler optimisation, reference counting},
location = {Lowell, MA, USA},
series = {IFL 2018}
}
@inproceedings{Halide,
author = {Ragan-Kelley, Jonathan and Barnes, Connelly and Adams, Andrew and Paris, Sylvain and Durand, Fr\'{e}do and Amarasinghe, Saman},
title = {Halide: A Language and Compiler for Optimizing Parallelism, Locality, and Recomputation in Image Processing Pipelines},
year = {2013},
isbn = {9781450320146},
publisher = {ACM},
address = {New York, NY, USA},
doi = {10.1145/2491956.2462176},
abstract = {Image processing pipelines combine the challenges of stencil computations and stream programs. They are composed of large graphs of different stencil stages, as well as complex reductions, and stages with global or data-dependent access patterns. Because of their complex structure, the performance difference between a naive implementation of a pipeline and an optimized one is often an order of magnitude. Efficient implementations require optimization of both parallelism and locality, but due to the nature of stencils, there is a fundamental tension between parallelism, locality, and introducing redundant recomputation of shared values. We present a systematic model of the tradeoff space fundamental to stencil pipelines, a schedule representation which describes concrete points in this space for each stage in an image processing pipeline, and an optimizing compiler for the Halide image processing language that synthesizes high performance implementations from a Halide algorithm and a schedule. Combining this compiler with stochastic search over the space of schedules enables terse, composable programs to achieve state-of-the-art performance on a wide range of real image processing pipelines, and across different hardware architectures, including multicores with SIMD, and heterogeneous CPU+GPU execution. From simple Halide programs written in a few hours, we demonstrate performance up to 5x faster than hand-tuned C, intrinsics, and CUDA implementations optimized by experts over weeks or months, for image processing applications beyond the reach of past automatic compilers.},
booktitle = {Proceedings of the 34th ACM SIGPLAN Conference on Programming Language Design and Implementation},
pages = {519–530},
numpages = {12},
keywords = {redundant computation, gpu, compiler, vectorization, image processing, parallelism, autotuning, optimization, domain specific language, locality},
location = {Seattle, Washington, USA},
series = {PLDI '13}
}
@inproceedings{Futhark,
author = {Henriksen, Troels and Serup, Niels G. W. and Elsman, Martin and Henglein, Fritz and Oancea, Cosmin E.},
title = {Futhark: Purely Functional GPU-Programming with Nested Parallelism and In-Place Array Updates},
year = {2017},
isbn = {9781450349888},
publisher = {ACM},
address = {New York, NY, USA},
doi = {10.1145/3062341.3062354},
abstract = { Futhark is a purely functional data-parallel array language that offers a machine-neutral programming model and an optimising compiler that generates OpenCL code for GPUs.  This paper presents the design and implementation of three key features of Futhark that seek a suitable middle ground with imperative approaches.  First, in order to express efficient code inside the parallel constructs, we introduce a simple type system for in-place updates that ensures referential transparency and supports equational reasoning.  Second, we furnish Futhark with parallel operators capable of expressing efficient strength-reduced code, along with their fusion rules.  Third, we present a flattening transformation aimed at enhancing the degree of parallelism that (i) builds on loop interchange and distribution but uses higher-order reasoning rather than array-dependence analysis, and (ii) still allows further locality-of-reference optimisations. Finally, an evaluation on 16 benchmarks demonstrates the impact of the language and compiler features and shows application-level performance competitive with hand-written GPU code. },
booktitle = {Proceedings of the 38th ACM SIGPLAN Conference on Programming Language Design and Implementation},
pages = {556–571},
numpages = {16},
keywords = {GPGPU, compilers, functional language, parallel},
location = {Barcelona, Spain},
series = {PLDI 2017}
}
@InProceedings{GrelSchoTrojIFL03,
  author     = {Clemens Grelck and Sven-Bodo Scholz and Kai Trojahner},
  title      = {With-loop Scalarization: Merging Nested Array Operations},
  booktitle  = {Implementation of Functional Languages, 15th International Workshop (IFL'03), Edinburgh, Scotland, UK, Revised Selected Papers},
  year       = {2004},
  editor     = {Phil Trinder and Greg Michaelson},
  volume     = {3145},
  series     = {Lecture Notes in Computer Science},
  publisher  = {Springer},
  category   = {design, opt},
  doi        = {10.1007/978-3-540-27861-0_8},
  pubaddress = {Berlin, Heidelberg, Germany},
  topics     = {SAC,Avoiding Temporaries},
  url        = {WLSMNAO.pdf},
}
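# Tab-indented entry with a doubly-braced title to protect its capitalisation.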
@inproceedings{HiCUDA,
	author = {Han, Tianyi David and Abdelrahman, Tarek S.},
	title = {{HiCUDA: A High-Level Directive-Based Language for GPU Programming}},
	year = {2009},
	isbn = {9781605585178},
	publisher = {ACM},
	address = {New York, NY, USA},
	doi = {10.1145/1513895.1513902},
	abstract = {The Compute Unified Device Architecture (CUDA) has become a de facto standard for programming NVIDIA GPUs. However, CUDA places on the programmer the burden of packaging GPU code in separate functions, of explicitly managing data transfer between the host memory and various components of the GPU memory, and of manually optimizing the utilization of the GPU memory. Practical experience shows that the programmer needs to make significant code changes, which are often tedious and error-prone, before getting an optimized program. We have designed hiCUDA, a high-level directive-based language for CUDA programming. It allows programmers to perform these tedious tasks in a simpler manner, and directly to the sequential code. Nonetheless, it supports the same programming paradigm already familiar to CUDA programmers. We have prototyped a source-to-source compiler that translates a hiCUDA program to a CUDA program. Experiments using five standard CUDA benchmarks show that the simplicity and flexibility hiCUDA provides come at no expense to performance.},
	booktitle = {Proceedings of 2nd Workshop on General Purpose Processing on Graphics Processing Units},
	pages = {52–61},
	numpages = {10},
	keywords = {GPGPU, data parallel programming, CUDA},
	location = {Washington, D.C., USA},
	series = {GPGPU-2}
}
235@INPROCEEDINGS{LIFTIR,
236	author={M. {Steuwer} and T. {Remmelg} and C. {Dubach}},
237	booktitle={2017 IEEE/ACM International Symposium on Code Generation and Optimization (CGO)},
238	title={{LIFT: A functional data-parallel IR for high-performance GPU code generation}},
239	year={2017},
240	volume={},
241	number={},
242	pages={74-85},
243	doi={10.1109/CGO.2017.7863730}
244}
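# Entry using an unquoted month macro (Jul) and single-space field indentation.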
@article{fitzgerald1996,
 author = {Fitzgerald, Steven M. and Oldehoeft, Rodney R.},
 title = {Update-in-place Analysis for True Multidimensional Arrays},
 journal = {Sci. Program.},
 issue_date = {Summer 1996},
 volume = {5},
 number = {2},
 month = Jul,
 year = {1996},
 issn = {1058-9244},
 pages = {147--160},
 numpages = {14},
 doi = {10.1155/1996/493673},
 acmid = {226640},
 publisher = {IOS Press},
 address = {Amsterdam, The Netherlands, The Netherlands},
}
262@inproceedings{Guo2014impact,
263	author = {Guo, Jing and Bernecky, Robert and
264			Thiyagalingam, Jeyarajan and Scholz, Sven-Bodo},
265	title = {Polyhedral Methods for Improving Parallel Update-in-Place},
266	booktitle = {Proceedings of the 4th International Workshop on Polyhedral Compilation Techniques},
267	editor = {Rajopadhye, Sanjay and Verdoolaege, Sven},
268	year   = 2014,
269	month  = Jan,
270	address = {Vienna, Austria}
271}
272@INPROCEEDINGS{chien2019,
273    author={S. {Chien} and I. {Peng} and S. {Markidis}},
274    booktitle={2019 IEEE/ACM Workshop on Memory Centric High Performance Computing (MCHPC)},
275    title={Performance Evaluation of Advanced Features in CUDA Unified Memory},
276    year={2019},
277    volume={},
278    number={},
279    pages={50-57},
280    doi={10.1109/MCHPC49590.2019.00014}
281}
@InProceedings{dyntaskgpu2013,
    author={Chatterjee, Sanjay and Grossman, Max and Sb{\^i}rlea, Alina and Sarkar, Vivek},
    editor={Rajopadhye, Sanjay and Mills Strout, Michelle},
    title={{Dynamic Task Parallelism with a GPU Work-Stealing Runtime System}},
    booktitle={Languages and Compilers for Parallel Computing},
    year={2013},
    publisher={Springer Berlin Heidelberg},
    address={Berlin, Heidelberg},
    pages={203--217},
    abstract={NVIDIA's Compute Unified Device Architecture (CUDA) enabled GPUs become accessible to mainstream programming. Abundance of simple computational cores and high memory bandwidth make GPUs ideal candidates for data parallel applications. However, its potential for executing applications that combine task and data parallelism has not been explored in detail. CUDA does not provide a viable interface for creating dynamic tasks and handling load balancing issues. Any support for such has to be orchestrated entirely by the CUDA programmer today.},
    isbn={978-3-642-36036-7},
    doi={10.1007/978-3-642-36036-7_14}
}
295@article{async2012,
296	title = {{Performance models for asynchronous data transfers on consumer Graphics Processing Units}},
297	journal = {Journal of Parallel and Distributed Computing},
298	volume = {72},
299	number = {9},
300	pages = {1117--1126},
301	year = {2012},
302	note = {Accelerators for High-Performance Computing},
303	issn = {0743-7315},
304	doi = {10.1016/j.jpdc.2011.07.011},
305	author = {Juan Gómez-Luna and José María González-Linares and José Ignacio Benavides and Nicolás Guil},
306	keywords = {GPU, CUDA, Asynchronous transfers, Streams, Overlapping of communication and computation},
307	abstract = {Graphics Processing Units (GPU) have impressively arisen as general-purpose coprocessors in high performance computing applications, since the launch of the Compute Unified Device Architecture (CUDA). However, they present an inherent performance bottleneck in the fact that communication between two separate address spaces (the main memory of the CPU and the memory of the GPU) is unavoidable. The CUDA Application Programming Interface (API) provides asynchronous transfers and streams, which permit a staged execution, as a way to overlap communication and computation. Nevertheless, a precise manner to estimate the possible improvement due to overlapping does not exist, neither a rule to determine the optimal number of stages or streams in which computation should be divided. In this work, we present a methodology that is applied to model the performance of asynchronous data transfers of CUDA streams on different GPU architectures. Thus, we illustrate this methodology by deriving expressions of performance for two different consumer graphic architectures belonging to the more recent generations. These models permit programmers to estimate the optimal number of streams in which the computation on the GPU should be broken up, in order to obtain the highest performance improvements. Finally, we have checked the suitability of our performance models with three applications based on codes from the CUDA Software Development Kit (SDK) with successful results.}
308}
309@InProceedings{autocuda2012,
310author={Jung, Hanwoong and Yi, Youngmin and Ha, Soonhoi},
311editor={Wyrzykowski, Roman and Dongarra, Jack and Karczewski, Konrad and Wa{\'{s}}niewski, Jerzy},
312title={{Automatic CUDA Code Synthesis Framework for Multicore CPU and GPU Architectures}},
313booktitle={Parallel Processing and Applied Mathematics},
314year={2012},
315publisher={Springer Berlin Heidelberg},
316address={Berlin, Heidelberg},
317pages={579--588},
318abstract={Recently, general purpose GPU (GPGPU) programming has spread rapidly after CUDA was first introduced to write parallel programs in high-level languages for NVIDIA GPUs. While a GPU exploits data parallelism very effectively, task-level parallelism is exploited as a multi-threaded program on a multicore CPU. For such a heterogeneous platform that consists of a multicore CPU and GPU, we propose an automatic code synthesis framework that takes a process network model specification as input and generates a multithreaded CUDA code. With the model based specification, one can explicitly specify both function-level and loop-level parallelism in an application and explore the wide design space in mapping of function blocks and selecting the communication methods between CPU and GPU. The proposed technique is complementary to other high-level methods of CUDA programming.},
319isbn={978-3-642-31464-3},
320doi={10.1007/978-3-642-31464-3_59}
321}

@INPROCEEDINGS{uintah2012,
	author={Q. {Meng} and A. {Humphrey} and M. {Berzins}},
	booktitle={2012 SC Companion: High Performance Computing, Networking Storage and Analysis},
	title={The {Uintah} framework: a unified heterogeneous task scheduling and runtime system},
	year={2012},
	volume={},
	number={},
	pages={2441-2448},
	abstract={The development of a new unified, multi-threaded runtime system for the execution of asynchronous tasks on heterogeneous systems is described in this work. These asynchronous tasks arise from the Uintah framework, which was developed to provide an environment for solving a broad class of fluid-structure interaction problems on structured adaptive grids. Uintah has a clear separation between its MPI-free user-coded tasks and its runtime system that ensures these tasks execute efficiently. This separation also allows for complete isolation of the application developer from the complexities involved with the parallelism Uintah provides. While we have designed scalable runtime systems for large CPU core counts, the emergence of heterogeneous systems, with additional on-node accelerators and co-processors presents additional design challenges in terms of effectively utilizing all computational resources on-node and managing multiple levels of parallelism. Our work addresses these challenges for Uintah by the development of new hybrid runtime system and Unified multi-threaded MPI task scheduler, enabling Uintah to fully exploit current and emerging architectures with support for asynchronous, out-of-order scheduling of both CPU and GPU computational tasks. This design coupled with an approach that uses MPI to communicate between nodes, a shared memory model on-node and the use of novel lock-free data structures, has made it possible for Uintah to achieve excellent scalability for challenging fluid-structure problems using adaptive mesh refinement on as many as 256K cores on the DoE Jaguar XK6 system. This design has also demonstrated an ability to run capability jobs on the heterogeneous systems, Keeneland and TitanDev. In this work, the evolution of Uintah and its runtime system is examined in the context of our new Unified multi-threaded scheduler design. The performance of the Unified scheduler is also tested against previous Uintah scheduler and runtime designs over a range of processor core and GPU counts.},
	keywords={application program interfaces;data structures;message passing;multi-threading;processor scheduling;shared memory systems;Keeneland;TitanDev;DoE Jaguar XK6 system;adaptive mesh refinement;lock-free data structures;shared memory model on-node;asynchronous out-of-order scheduling;GPU computational tasks;hybrid runtime system;unified multithreaded MPI task scheduler design;coprocessors;on-node accelerators;CPU core counts;MPI-free user-coded tasks;structured adaptive grids;fluid-structure interaction problems;heterogeneous systems;asynchronous task execution;multithreaded runtime system;unified heterogeneous task scheduling;Uintah framework;Graphics processing units;Instruction sets;Runtime;Data warehouses;Computer architecture;Master-slave;Parallel processing},
	doi={10.1109/SCC.2012.6674233},
	ISSN={},
	month=Nov,
}
337@article{kim2013,
338	author = {Kim, Yooseong and Shrivastava, Aviral},
339	title = {Memory Performance Estimation of CUDA Programs},
340	year = {2013},
341	issue_date = {September 2013},
342	publisher = {ACM},
343	address = {New York, NY, USA},
344	volume = {13},
345	number = {2},
346	issn = {1539-9087},
347	doi = {10.1145/2514641.2514648},
348	abstract = {CUDA has successfully popularized GPU computing, and GPGPU applications are now used in various embedded systems. The CUDA programming model provides a simple interface to program on GPUs, but tuning GPGPU applications for high performance is still quite challenging. Programmers need to consider numerous architectural details, and small changes in source code, especially on the memory access pattern, can affect performance significantly. This makes it very difficult to optimize CUDA programs. This article presents CuMAPz, which is a tool to analyze and compare the memory performance of CUDA programs. CuMAPz can help programmers explore different ways of using shared and global memories, and optimize their program for efficient memory behavior. CuMAPz models several memory-performance-related factors: data reuse, global memory access coalescing, global memory latency hiding, shared memory bank conflict, channel skew, and branch divergence. Experimental results show that CuMAPz can accurately estimate performance with correlation coefficient of 0.96. By using CuMAPz to explore the memory access design space, we could improve the performance of our benchmarks by 30% more than the previous approach [Hong and Kim 2010].},
349	journal = {ACM Trans. Embed. Comput. Syst.},
350	month = Sep,
351	articleno = {21},
352	numpages = {22},
353	keywords = {CUDA, memory performance, program optimization, GPGPU, performance estimation}
354}
355@article{choi2020,
356	author = {Choi, Jake and You, Hojun and Kim, Chongam and Young Yeom, Heon and Kim, Yoonhee},
357	title = {{Comparing unified, pinned, and host/device memory allocations for memory-intensive workloads on Tegra SoC}},
358	journal = {Concurrency and Computation: Practice and Experience},
359	keywords = {benchmark, CFD, CUDA, GPU, memory, pinned, Rodinia, unified},
360	doi = {10.1002/cpe.6018},
361    year = {2020},
362    month = Sep,
363	abstract = {Summary Edge computing focuses on processing near the source of the data. Edge computing devices using the Tegra SoC architecture provide a physically distinct GPU memory architecture. In order to take advantage of this architecture, different modes of memory allocation need to be considered. Different GPU memory allocation techniques yield different results in memory usage and execution times of identical applications on Tegra devices. In this article, we implement several GPU application benchmarks, including our custom CFD code with unified, pinned, and normal host/device memory allocation modes. We evaluate and compare the memory usage and execution time of such workloads on edge computing Tegra system-on-chips (SoC) equipped with integrated GPUs using a shared memory architecture, and non-SoC machines with discrete GPUs equipped with distinct VRAM. We discover that utilizing normal memory allocation methods on SoCs actually use double the required memory because of unnecessary device memory copies, despite being physically shared with host memory. We show that GPU application memory usage can be reduced up to 50\%, and that even performance improvements can occur just by replacing normal memory allocation and memory copy methods with managed unified memory or pinned memory allocation.}
364}

@InProceedings{GrelSchoIFL02,
  author     = {Clemens Grelck and Sven-Bodo Scholz},
  title      = {Axis Control in {SAC}},
  booktitle  = {Implementation of Functional Languages, 14th International Workshop (IFL'02), Madrid, Spain, Revised Selected Papers},
  year       = {2003},
  editor     = {Ricardo Pe{\~n}a and Thomas Arts},
  volume     = {2670},
  series     = {Lecture Notes in Computer Science},
  pages      = {182--198},
  publisher  = {Springer},
  abstract   = {High-level array processing is characterized by the composition of generic operations, which treat all array elements in a uniform way. This paper proposes a mechanism that allows programmers to direct effects of such array operations to non-scalar subarrays of argument arrays without sacrificing the high-level programming approach. A versatile notation for axis control is presented, and it is shown how the additional language constructs can be transformed into regular SaC code. Furthermore, an optimization technique is introduced which achieves the same runtime performance regardless of whether code is written using the new notation or in a substantially less elegant style employing conventional language features.},
  category   = {core, opt},
  pubaddress = {Berlin, Heidelberg, Germany},
  sourceurl  = {http://www.isp.mu-luebeck.de/~grelck/publications/axis-control-madrid-02.ps.gz},
  topics     = {SAC},
}