# This BibTeX is provided as an example/test case for the bb4dw DokuWiki plugin
# This file intentionally uses various BibTeX formatting and styles!

@TECHREPORT{Cann89,
  AUTHOR = {D.C. Cann},
  EMAIL = {},
  TITLE = {{C}ompilation {T}echniques for {H}igh {P}erformance {A}pplicative
           {C}omputation},
  INSTITUTION = {Lawrence Livermore National Laboratory},
  ADDRESS = {LLNL, Livermore, California},
  TYPE = {},
  NUMBER = {CS-89-108},
  YEAR = 1989,
  NOTE = {},
  FTP = {},
  KEYWORDS = {},
  CONTENTS = {A comprehensive introduction to SISAL's internal structure.
              Includes a detailed description of the enhanced "update in place"
              mechanism used by SISAL.},
  TOPICS = {Sisal}
}
@INPROCEEDINGS{IPDPS2018, author={T. {Macht} and C. {Grelck}}, booktitle={2019 IEEE International Parallel and Distributed Processing Symposium (IPDPS)}, title={SAC Goes Cluster: Fully Implicit Distributed Computing}, year={2019}, volume={}, number={}, pages={996-1006},}

@ARTICLE{ ScholzJFP03,
  AUTHOR = {Sven-Bodo Scholz},
  TITLE = {{Single} {Assignment} {C} ---
           Efficient Support for High-Level Array Operations in a Functional Setting},
  JOURNAL = {Journal of Functional Programming},
  YEAR = {2003},
  VOLUME = {13},
  NUMBER = {6},
  PAGES = {1005--1059},
  NOTE = {},
  CONTENTS = {Canonical reference for SAC},
  DOI = {10.1017/S0956796802004458},
  ABSTRACT = {This paper presents a novel approach for integrating arrays with access time O(1) into functional languages. It introduces n-dimensional arrays combined with a type system that supports hierarchies of array types with varying shape information as well as a shape-invariant form of array comprehension called with-loop. Together, these constructs allow for a programming style similar to that of array programming languages such as Apl. We use Single Assignment C (SaC), a functional C-variant aimed at numerical applications that is based on the proposed design, to demonstrate that programs written in that style can be compiled to code whose runtime performance is competitive with that of hand-optimized Fortran programs. However, essential prerequisites for such performance figures are a shape inference system integrated in the type system as well as several high-level optimizations. Most notably of these is With Loop Folding, an optimization technique for eliminating intermediate arrays.},
  CATEGORY = {Journal},
  TOPICS = {SAC}
}

@ARTICLE{ GrelSchoIJPP06,
  AUTHOR = {Clemens Grelck and Sven-Bodo Scholz},
  TITLE = {{SAC}: A Functional Array Language for Efficient Multithreaded Execution},
  JOURNAL = {International Journal of Parallel Programming},
  YEAR = 2006,
  VOLUME = {34},
  NUMBER = {4},
  PAGES = {383--427},
  NOTE = {},
  CONTENTS = {[ISSN: 0885-7458 (Paper) 1573-7640 (Online)]},
  DOI = {10.1007/s10766-006-0018-x},
  ABSTRACT = {We give an in-depth introduction to the design of our functional array programming language SaC, the main aspects of its compilation into host machine code, and its parallelisation based on multi-threading. The language design of SaC aims at combining high-level, compositional array programming with fully automatic resource management for highly productive code development and maintenance. We outline the compilation process that maps SaC programs to computing machinery. Here, our focus is on optimisation techniques that aim at restructuring entire applications from nested compositions of general fine-grained operations into specialised coarse-grained operations. We present our implicit parallelisation technology for shared memory architectures based on multi-threading and discuss further optimisation opportunities on this level of code generation. Both optimisation and parallelisation rigorously exploit the absence of side-effects and the explicit data flow characteristic of a functional setting.},
  TOPICS = {SAC},
  CATEGORY = {Journal},
  AFFIL = {ctca}

}

@misc{Sakharnykh2017,
  author = "{Nikolay Sakharnykh}",
  title = "{Maximizing Unified Memory Performance in CUDA}",
  year = "2017",
  howpublished = "\url{https://devblogs.nvidia.com/maximizing-unified-memory-performance-cuda/}",
  note = "[Online; 29-May-2019]"
}
@misc{cuda10.1doc,
  author = "{NVIDIA Corporation}",
  title = "{CUDA Toolkit Documentation v10.1.168}",
  year = "2019",
  howpublished = "\url{https://web.archive.org/web/20190523173815/https://docs.nvidia.com/cuda/archive/10.1/}",
  note = "[WayBack Machine; 02-Nov-2019]"
}
@misc{cudaConcurrency2011,
  author = "{Steve Rennich}",
  title = "{CUDA C/C++ Streams and Concurrency}",
  year = "2011",
  howpublished = "\url{http://on-demand.gputechconf.com/gtc-express/2011/presentations/StreamsAndConcurrencyWebinar.pdf}",
  note = "[Online; 03-Nov-2019]"
}
@misc{cudaUnifiedMem2018,
  author = "{Nikolay Sakharnykh}",
  Title = "{Everything You Need To Know About Unified Memory}",
  year = "2018",
  howpublished = "\url{http://on-demand.gputechconf.com/gtc/2018/presentation/s8430-everything-you-need-to-know-about-unified-memory.pdf}",
  note = "[Online; 03-Nov-2019]"
}
@article{HARTMANN2019304,
  title = "GPUart - An application-based limited preemptive GPU real-time scheduler for embedded systems",
  journal = "Journal of Systems Architecture",
  volume = "97",
  pages = "304--319",
  year = "2019",
  issn = "1383-7621",
  doi = "https://doi.org/10.1016/j.sysarc.2018.10.005",
  author = "Christoph Hartmann and Ulrich Margull",
  keywords = "Real-time scheduling, Limited preemption, Graphics processing unit (GPU), GPU resource management, Embedded systems, Automotive",
  abstract = "Emerging technologies like autonomous driving entail computational intense software solutions. More and more companies accelerate their embedded applications by General Purpose Computing on the Graphics Processing Unit (GPGPU), in order to overcome those computational demands. Unfortunately, Graphics Processing Units (GPUs) severely lack real-time capability, for example controllable preemption support, which limits their applicability in the embedded domain. We therefore present GPUart, a framework for GPU real-time scheduling. GPUart focuses on embedded systems and requires neither hardware nor driver stack extensions. We propose a software-only approach for preemption, based on the fixed preemption point strategy. In contrast to prior work, GPUart enables preemption inside a thread block by adding fixed preemption points. We further propose a portable high-level resource management concept to enable gang scheduling on GPUs. GPUart can schedule GPU workload either under the Gang-Earliest Deadline First (EDF) or Gang-Fixed Task Priority (FTP) policy. A case-study on Nvidia Tegra X1, using real-world engine management applications from Audi AG and Continental Automotive GmbH, shows that only up to 0.28% additional global memory is required to enable interruptible thread blocks. GPUart reduces the worst observed response times by a factor of up to 221, leading to response times without deadline misses."
}
@article{grelck_2005,
  author={Grelck, Clemens},
  title={Shared memory multiprocessor support for functional array processing in SAC},
  volume={15},
  doi={10.1017/S0956796805005538},
  number={3},
  journal={Journal of Functional Programming},
  publisher={Cambridge University Press},
  year={2005},
  pages={353–401}
}
@InProceedings{ScholzIFL1997,
  author = {Sven-Bodo Scholz},
  title = {With-loop-folding in Sac --- Condensing Consecutive Array Operations},
  booktitle = {Implementation of Functional Languages, 9th International Workshop (IFL'97), St. Andrews, UK, Selected Papers},
  year = {1998},
  editor = {Chris Clack and Tony Davie and Kevin Hammond},
  volume = {1467},
  series = {Lecture Notes in Computer Science},
  pages = {72--92},
  publisher = {Springer},
  abstract = {This paper introduces a new compiler optimization called With-loop-folding. It is based on a special loop construct, the with-loop, which in the functional language SAC (for Single Assignment C) serves as a versatile vehicle to describe array operations on an elementwise basis. A general mechanism for combining two of these With-loops into a single loop construct is presented. This mechanism constitutes a powerful tool when it comes to generate efficiently executable code from high-level array specifications. By means of a few examples it is shown that even complex nestings of array operations similar to those available in Apl can be transformed into single loop operations which are similar to hand-optimized With-loop specifications. As a consequence, the way a complex array operation is combined from primitive array operations does not affect the runtime performance of the compiled code, i.e., the programmer is liberated from the burden to take performance considerations into account when specifying complex array operations.},
  category = {core,design,opt},
  doi = {10.1007/BFb0055425},
  isbn = {978-3-540-64849-9},
  pubaddress = {Berlin, Heidelberg, Germany},
  topics = {SAC,Avoiding Temporaries,Implementation of Arrays},
  url = {wlf-st-andrews-97.pdf},
}
@inproceedings{jingGPU2011,
  author = {Jing Guo and Jeyarajan Thiyagalingam and Sven-Bodo Scholz},
  title = {Breaking the Gpu Programming Barrier with the Auto-parallelising Sac Compiler},
  booktitle = {6th Workshop on Declarative Aspects of Multicore Programming (DAMP'11), Austin, USA},
  year = {2011},
  pages = {15--24},
  publisher = {ACM Press},
  doi = {10.1145/1926354.1926359},
}
@inproceedings{jingGPU2009,
  author = {Jing Guo and
            Jeyarajan Thiyagalingam and
            Sven{-}Bodo Scholz},
  editor = {Zolt{\'{a}}n Horv{\'{a}}th and
            Vikt{\'{o}}ria Zs{\'{o}}k and
            Peter Achten and
            Pieter W. M. Koopman},
  title = {Towards Compiling {SAC} to {CUDA}},
  booktitle = {Proceedings of the Tenth Symposium on Trends in Functional Programming,
               {TFP} 2009, Kom{\'{a}}rno, Slovakia, June 2-4, 2009},
  series = {Trends in Functional Programming},
  volume = {10},
  pages = {33--48},
  publisher = {Intellect},
  year = {2009},
  timestamp = {Tue, 04 Jun 2013 08:01:28 +0200},
  biburl = {https://dblp.org/rec/conf/sfp/GuoTS09.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{emr2018,
author = {Vie\ss{}mann, Hans-Nikolai and \v{S}inkarovs, Artjoms and Scholz, Sven-Bodo},
title = {Extended Memory Reuse: An Optimisation for Reducing Memory Allocations},
year = {2018},
isbn = {9781450371438},
publisher = {ACM},
address = {New York, NY, USA},
doi = {10.1145/3310232.3310242},
booktitle = {Proceedings of the 30th Symposium on Implementation and Application of Functional Languages},
pages = {107–118},
numpages = {12},
keywords = {memory management, compiler optimisation, reference counting},
location = {Lowell, MA, USA},
series = {IFL 2018}
}
@inproceedings{Halide,
author = {Ragan-Kelley, Jonathan and Barnes, Connelly and Adams, Andrew and Paris, Sylvain and Durand, Fr\'{e}do and Amarasinghe, Saman},
title = {Halide: A Language and Compiler for Optimizing Parallelism, Locality, and Recomputation in Image Processing Pipelines},
year = {2013},
isbn = {9781450320146},
publisher = {ACM},
address = {New York, NY, USA},
doi = {10.1145/2491956.2462176},
abstract = {Image processing pipelines combine the challenges of stencil computations and stream programs. They are composed of large graphs of different stencil stages, as well as complex reductions, and stages with global or data-dependent access patterns. Because of their complex structure, the performance difference between a naive implementation of a pipeline and an optimized one is often an order of magnitude. Efficient implementations require optimization of both parallelism and locality, but due to the nature of stencils, there is a fundamental tension between parallelism, locality, and introducing redundant recomputation of shared values. We present a systematic model of the tradeoff space fundamental to stencil pipelines, a schedule representation which describes concrete points in this space for each stage in an image processing pipeline, and an optimizing compiler for the Halide image processing language that synthesizes high performance implementations from a Halide algorithm and a schedule. Combining this compiler with stochastic search over the space of schedules enables terse, composable programs to achieve state-of-the-art performance on a wide range of real image processing pipelines, and across different hardware architectures, including multicores with SIMD, and heterogeneous CPU+GPU execution. From simple Halide programs written in a few hours, we demonstrate performance up to 5x faster than hand-tuned C, intrinsics, and CUDA implementations optimized by experts over weeks or months, for image processing applications beyond the reach of past automatic compilers.},
booktitle = {Proceedings of the 34th ACM SIGPLAN Conference on Programming Language Design and Implementation},
pages = {519–530},
numpages = {12},
keywords = {redundant computation, gpu, compiler, vectorization, image processing, parallelism, autotuning, optimization, domain specific language, locality},
location = {Seattle, Washington, USA},
series = {PLDI '13}
}
@inproceedings{Futhark,
author = {Henriksen, Troels and Serup, Niels G. W. and Elsman, Martin and Henglein, Fritz and Oancea, Cosmin E.},
title = {Futhark: Purely Functional GPU-Programming with Nested Parallelism and in-Place Array Updates},
year = {2017},
isbn = {9781450349888},
publisher = {ACM},
address = {New York, NY, USA},
doi = {10.1145/3062341.3062354},
abstract = {Futhark is a purely functional data-parallel array language that offers a machine-neutral programming model and an optimising compiler that generates OpenCL code for GPUs. This paper presents the design and implementation of three key features of Futhark that seek a suitable middle ground with imperative approaches. First, in order to express efficient code inside the parallel constructs, we introduce a simple type system for in-place updates that ensures referential transparency and supports equational reasoning. Second, we furnish Futhark with parallel operators capable of expressing efficient strength-reduced code, along with their fusion rules. Third, we present a flattening transformation aimed at enhancing the degree of parallelism that (i) builds on loop interchange and distribution but uses higher-order reasoning rather than array-dependence analysis, and (ii) still allows further locality-of-reference optimisations. Finally, an evaluation on 16 benchmarks demonstrates the impact of the language and compiler features and shows application-level performance competitive with hand-written GPU code.},
booktitle = {Proceedings of the 38th ACM SIGPLAN Conference on Programming Language Design and Implementation},
pages = {556–571},
numpages = {16},
keywords = {GPGPU, compilers, functional language, parallel},
location = {Barcelona, Spain},
series = {PLDI 2017}
}
@InProceedings{GrelSchoTrojIFL03,
  author = {Clemens Grelck and Sven-Bodo Scholz and Kai Trojahner},
  title = {With-loop Scalarization: Merging Nested Array Operations},
  booktitle = {Implementation of Functional Languages, 15th International Workshop (IFL'03), Edinburgh, Scotland, UK, Revised Selected Papers},
  year = {2004},
  editor = {Phil Trinder and Greg Michaelson},
  volume = {3145},
  series = {Lecture Notes in Computer Science},
  publisher = {Springer},
  category = {design, opt},
  doi = {10.1007/978-3-540-27861-0_8},
  pubaddress = {Berlin, Heidelberg, Germany},
  topics = {SAC,Avoiding Temporaries},
  url = {WLSMNAO.pdf},
}
@inproceedings{HiCUDA,
  author = {Han, Tianyi David and Abdelrahman, Tarek S.},
  title = {{HiCUDA: A High-Level Directive-Based Language for GPU Programming}},
  year = {2009},
  isbn = {9781605585178},
  publisher = {ACM},
  address = {New York, NY, USA},
  doi = {10.1145/1513895.1513902},
  abstract = {The Compute Unified Device Architecture (CUDA) has become a de facto standard for programming NVIDIA GPUs. However, CUDA places on the programmer the burden of packaging GPU code in separate functions, of explicitly managing data transfer between the host memory and various components of the GPU memory, and of manually optimizing the utilization of the GPU memory. Practical experience shows that the programmer needs to make significant code changes, which are often tedious and error-prone, before getting an optimized program. We have designed hiCUDA, a high-level directive-based language for CUDA programming. It allows programmers to perform these tedious tasks in a simpler manner, and directly to the sequential code. Nonetheless, it supports the same programming paradigm already familiar to CUDA programmers. We have prototyped a source-to-source compiler that translates a hiCUDA program to a CUDA program. Experiments using five standard CUDA benchmarks show that the simplicity and flexibility hiCUDA provides come at no expense to performance.},
  booktitle = {Proceedings of 2nd Workshop on General Purpose Processing on Graphics Processing Units},
  pages = {52–61},
  numpages = {10},
  keywords = {GPGPU, data parallel programming, CUDA},
  location = {Washington, D.C., USA},
  series = {GPGPU-2}
}
@INPROCEEDINGS{LIFTIR,
  author={M. {Steuwer} and T. {Remmelg} and C. {Dubach}},
  booktitle={2017 IEEE/ACM International Symposium on Code Generation and Optimization (CGO)},
  title={{LIFT: A functional data-parallel IR for high-performance GPU code generation}},
  year={2017},
  volume={},
  number={},
  pages={74-85},
  doi={10.1109/CGO.2017.7863730}
}
@article{fitzgerald1996,
  author = {Fitzgerald, Steven M. and Oldehoeft, Rodney R.},
  title = {Update-in-place Analysis for True Multidimensional Arrays},
  journal = {Sci. Program.},
  issue_date = {Summer 1996},
  volume = {5},
  number = {2},
  month = Jul,
  year = {1996},
  issn = {1058-9244},
  pages = {147--160},
  numpages = {14},
  doi = {10.1155/1996/493673},
  acmid = {226640},
  publisher = {IOS Press},
  address = {Amsterdam, The Netherlands, The Netherlands},
}
@inproceedings{Guo2014impact,
  author = {Guo, Jing and Bernecky, Robert and
            Thiyagalingam, Jeyarajan and Scholz, Sven-Bodo},
  title = {Polyhedral Methods for Improving Parallel Update-in-Place},
  booktitle = {Proceedings of the 4th International Workshop on Polyhedral Compilation Techniques},
  editor = {Rajopadhye, Sanjay and Verdoolaege, Sven},
  year = 2014,
  month = Jan,
  address = {Vienna, Austria}
}
@INPROCEEDINGS{chien2019,
  author={S. {Chien} and I. {Peng} and S. {Markidis}},
  booktitle={2019 IEEE/ACM Workshop on Memory Centric High Performance Computing (MCHPC)},
  title={Performance Evaluation of Advanced Features in CUDA Unified Memory},
  year={2019},
  volume={},
  number={},
  pages={50-57},
  doi={10.1109/MCHPC49590.2019.00014}
}
@InProceedings{dyntaskgpu2013,
  author={Chatterjee, Sanjay and Grossman, Max and Sb{\^i}rlea, Alina and Sarkar, Vivek},
  editor={Rajopadhye, Sanjay and Mills Strout, Michelle},
  title={{Dynamic Task Parallelism with a GPU Work-Stealing Runtime System}},
  booktitle={Languages and Compilers for Parallel Computing},
  year={2013},
  publisher={Springer Berlin Heidelberg},
  address={Berlin, Heidelberg},
  pages={203--217},
  abstract={NVIDIA's Compute Unified Device Architecture (CUDA) enabled GPUs become accessible to mainstream programming. Abundance of simple computational cores and high memory bandwidth make GPUs ideal candidates for data parallel applications. However, its potential for executing applications that combine task and data parallelism has not been explored in detail. CUDA does not provide a viable interface for creating dynamic tasks and handling load balancing issues. Any support for such has to be orchestrated entirely by the CUDA programmer today.},
  isbn={978-3-642-36036-7},
  doi={10.1007/978-3-642-36036-7_14}
}
@article{async2012,
  title = {{Performance models for asynchronous data transfers on consumer Graphics Processing Units}},
  journal = {Journal of Parallel and Distributed Computing},
  volume = {72},
  number = {9},
  pages = {1117--1126},
  year = {2012},
  note = {Accelerators for High-Performance Computing},
  issn = {0743-7315},
  doi = {10.1016/j.jpdc.2011.07.011},
  author = {Juan Gómez-Luna and José María González-Linares and José Ignacio Benavides and Nicolás Guil},
  keywords = {GPU, CUDA, Asynchronous transfers, Streams, Overlapping of communication and computation},
  abstract = {Graphics Processing Units (GPU) have impressively arisen as general-purpose coprocessors in high performance computing applications, since the launch of the Compute Unified Device Architecture (CUDA). However, they present an inherent performance bottleneck in the fact that communication between two separate address spaces (the main memory of the CPU and the memory of the GPU) is unavoidable. The CUDA Application Programming Interface (API) provides asynchronous transfers and streams, which permit a staged execution, as a way to overlap communication and computation. Nevertheless, a precise manner to estimate the possible improvement due to overlapping does not exist, neither a rule to determine the optimal number of stages or streams in which computation should be divided. In this work, we present a methodology that is applied to model the performance of asynchronous data transfers of CUDA streams on different GPU architectures. Thus, we illustrate this methodology by deriving expressions of performance for two different consumer graphic architectures belonging to the more recent generations. These models permit programmers to estimate the optimal number of streams in which the computation on the GPU should be broken up, in order to obtain the highest performance improvements. Finally, we have checked the suitability of our performance models with three applications based on codes from the CUDA Software Development Kit (SDK) with successful results.}
}
@InProceedings{autocuda2012,
author={Jung, Hanwoong and Yi, Youngmin and Ha, Soonhoi},
editor={Wyrzykowski, Roman and Dongarra, Jack and Karczewski, Konrad and Wa{\'{s}}niewski, Jerzy},
title={{Automatic CUDA Code Synthesis Framework for Multicore CPU and GPU Architectures}},
booktitle={Parallel Processing and Applied Mathematics},
year={2012},
publisher={Springer Berlin Heidelberg},
address={Berlin, Heidelberg},
pages={579--588},
abstract={Recently, general purpose GPU (GPGPU) programming has spread rapidly after CUDA was first introduced to write parallel programs in high-level languages for NVIDIA GPUs. While a GPU exploits data parallelism very effectively, task-level parallelism is exploited as a multi-threaded program on a multicore CPU. For such a heterogeneous platform that consists of a multicore CPU and GPU, we propose an automatic code synthesis framework that takes a process network model specification as input and generates a multithreaded CUDA code. With the model based specification, one can explicitly specify both function-level and loop-level parallelism in an application and explore the wide design space in mapping of function blocks and selecting the communication methods between CPU and GPU. The proposed technique is complementary to other high-level methods of CUDA programming.},
isbn={978-3-642-31464-3},
doi={10.1007/978-3-642-31464-3_59}
}

@INPROCEEDINGS{uintah2012,
  author={Q. {Meng} and A. {Humphrey} and M. {Berzins}},
  booktitle={2012 SC Companion: High Performance Computing, Networking Storage and Analysis},
  title={The Uintah framework: a unified heterogeneous task scheduling and runtime system},
  year={2012},
  volume={},
  number={},
  pages={2441-2448},
  abstract={The development of a new unified, multi-threaded runtime system for the execution of asynchronous tasks on heterogeneous systems is described in this work. These asynchronous tasks arise from the Uintah framework, which was developed to provide an environment for solving a broad class of fluid-structure interaction problems on structured adaptive grids. Uintah has a clear separation between its MPI-free user-coded tasks and its runtime system that ensures these tasks execute efficiently. This separation also allows for complete isolation of the application developer from the complexities involved with the parallelism Uintah provides. While we have designed scalable runtime systems for large CPU core counts, the emergence of heterogeneous systems, with additional on-node accelerators and co-processors presents additional design challenges in terms of effectively utilizing all computational resources on-node and managing multiple levels of parallelism. Our work addresses these challenges for Uintah by the development of new hybrid runtime system and Unified multi-threaded MPI task scheduler, enabling Uintah to fully exploit current and emerging architectures with support for asynchronous, out-of-order scheduling of both CPU and GPU computational tasks. This design coupled with an approach that uses MPI to communicate between nodes, a shared memory model on-node and the use of novel lock-free data structures, has made it possible for Uintah to achieve excellent scalability for challenging fluid-structure problems using adaptive mesh refinement on as many as 256K cores on the DoE Jaguar XK6 system. This design has also demonstrated an ability to run capability jobs on the heterogeneous systems, Keeneland and TitanDev. In this work, the evolution of Uintah and its runtime system is examined in the context of our new Unified multi-threaded scheduler design. The performance of the Unified scheduler is also tested against previous Uintah scheduler and runtime designs over a range of processor core and GPU counts.},
  keywords={application program interfaces;data structures;message passing;multi-threading;processor scheduling;shared memory systems;Keeneland;TitanDev;DoE Jaguar XK6 system;adaptive mesh refinement;lock-free data structures;shared memory model on-node;asynchronous out-of-order scheduling;GPU computational tasks;hybrid runtime system;unified multithreaded MPI task scheduler design;coprocessors;on-node accelerators;CPU core counts;MPI-free user-coded tasks;structured adaptive grids;fluid-structure interaction problems;heterogeneous systems;asynchronous task execution;multithreaded runtime system;unified heterogeneous task scheduling;Uintah framework;Graphics processing units;Instruction sets;Runtime;Data warehouses;Computer architecture;Master-slave;Parallel processing},
  doi={10.1109/SCC.2012.6674233},
  ISSN={},
  month=Nov,
}
@article{kim2013,
  author = {Kim, Yooseong and Shrivastava, Aviral},
  title = {Memory Performance Estimation of CUDA Programs},
  year = {2013},
  issue_date = {September 2013},
  publisher = {ACM},
  address = {New York, NY, USA},
  volume = {13},
  number = {2},
  issn = {1539-9087},
  doi = {10.1145/2514641.2514648},
  abstract = {CUDA has successfully popularized GPU computing, and GPGPU applications are now used in various embedded systems. The CUDA programming model provides a simple interface to program on GPUs, but tuning GPGPU applications for high performance is still quite challenging. Programmers need to consider numerous architectural details, and small changes in source code, especially on the memory access pattern, can affect performance significantly. This makes it very difficult to optimize CUDA programs. This article presents CuMAPz, which is a tool to analyze and compare the memory performance of CUDA programs. CuMAPz can help programmers explore different ways of using shared and global memories, and optimize their program for efficient memory behavior. CuMAPz models several memory-performance-related factors: data reuse, global memory access coalescing, global memory latency hiding, shared memory bank conflict, channel skew, and branch divergence. Experimental results show that CuMAPz can accurately estimate performance with correlation coefficient of 0.96. By using CuMAPz to explore the memory access design space, we could improve the performance of our benchmarks by 30% more than the previous approach [Hong and Kim 2010].},
  journal = {ACM Trans. Embed. Comput. Syst.},
  month = Sep,
  articleno = {21},
  numpages = {22},
  keywords = {CUDA, memory performance, program optimization, GPGPU, performance estimation}
}
@article{choi2020,
  author = {Choi, Jake and You, Hojun and Kim, Chongam and Young Yeom, Heon and Kim, Yoonhee},
  title = {{Comparing unified, pinned, and host/device memory allocations for memory-intensive workloads on Tegra SoC}},
  journal = {Concurrency and Computation: Practice and Experience},
  keywords = {benchmark, CFD, CUDA, GPU, memory, pinned, Rodinia, unified},
  doi = {10.1002/cpe.6018},
  year = {2020},
  month = Sep,
  abstract = {Edge computing focuses on processing near the source of the data. Edge computing devices using the Tegra SoC architecture provide a physically distinct GPU memory architecture. In order to take advantage of this architecture, different modes of memory allocation need to be considered. Different GPU memory allocation techniques yield different results in memory usage and execution times of identical applications on Tegra devices. In this article, we implement several GPU application benchmarks, including our custom CFD code with unified, pinned, and normal host/device memory allocation modes. We evaluate and compare the memory usage and execution time of such workloads on edge computing Tegra system-on-chips (SoC) equipped with integrated GPUs using a shared memory architecture, and non-SoC machines with discrete GPUs equipped with distinct VRAM. We discover that utilizing normal memory allocation methods on SoCs actually use double the required memory because of unnecessary device memory copies, despite being physically shared with host memory. We show that GPU application memory usage can be reduced up to 50\%, and that even performance improvements can occur just by replacing normal memory allocation and memory copy methods with managed unified memory or pinned memory allocation.}
}

@InProceedings{GrelSchoIFL02,
  author = {Clemens Grelck and Sven-Bodo Scholz},
  title = {Axis Control in Sac},
  booktitle = {Implementation of Functional Languages, 14th International Workshop (IFL'02), Madrid, Spain, Revised Selected Papers},
  year = {2003},
  editor = {Ricardo Pe{\~n}a and Thomas Arts},
  volume = {2670},
  series = {Lecture Notes in Computer Science},
  pages = {182--198},
  publisher = {Springer},
  abstract = {High-level array processing is characterized by the composition of generic operations, which treat all array elements in a uniform way. This paper proposes a mechanism that allows programmers to direct effects of such array operations to non-scalar subarrays of argument arrays without sacrificing the high-level programming approach. A versatile notation for axis control is presented, and it is shown how the additional language constructs can be transformed into regular SaC code. Furthermore, an optimization technique is introduced which achieves the same runtime performance regardless of whether code is written using the new notation or in a substantially less elegant style employing conventional language features.},
  category = {core, opt},
  pubaddress = {Berlin, Heidelberg, Germany},
  sourceurl = {http://www.isp.mu-luebeck.de/~grelck/publications/axis-control-madrid-02.ps.gz},
  topics = {SAC},
}