\documentclass{beamer} \usetheme{Darmstadt} \usepackage[english]{babel} \usepackage[latin1]{inputenc} \setbeamercovered{transparent} \title{Case Studies in Parallelizing Compilers} % \author{Ting-Jun Fan \and Gerard Medioni \and Ramakant Nevatia} % there has to be a better way \author{Aly Merchant} \date{10th April 2006} \begin{document} \frame{\titlepage} \frame{\tableofcontents} \section{Overview} \frame{\frametitle{Hypothetical Compiler} \includegraphics[height=7cm]{hypo.png} } \frame{\frametitle{Loops 1} \includegraphics[height=7cm]{loop1.png} } \frame{\frametitle{Loops 2} \includegraphics[height=7cm]{loop2.png} } \section{SUIF Implementation} \frame{\frametitle{SUIF} } \frame{\frametitle{Data Analysis}\begin{itemize} \item "basic" data dependence test \item formulates an integer programming problem \item analyzes arrays and scalars for privatization \end{itemize}}\frame{\frametitle{Loop nests}\begin{itemize} \item loop nests are critical \item loops may be nested across function boundaries \item inlining is possible, but inlining large blocks of code scares compilers \end{itemize}}\frame{\frametitle{Interprocedural Analysis}\begin{itemize} \item analyze functions for a set of side effects \item clones functions based on call site context \item only clone when necessary (based on context) \end{itemize}}\frame{\frametitle{Memory Problems}\begin{itemize} \item communication: sharing with coherent caches \item capacity: large working sets \item associativity: contention for a few cache lines \item line size: false sharing \end{itemize}}\frame{\frametitle{Prefetching}\begin{itemize} \item hide latency of cache misses \item SUIF will insert prefetching code \end{itemize}}\frame{\frametitle{Data Reuse}\begin{itemize} \item affine partitioning \item focus on maximizing data reuse \item reduce communication and working set size \end{itemize}}\frame{\frametitle{Locality}\begin{itemize} \item loop blocking \item tries to group data into contiguous segments \item improve spatial locality and reduce false sharing \item uses data permutation and strip-mining to achieve its goals \end{itemize}}\frame{\frametitle{Data Transformations} \includegraphics[height=7cm]{suifloop.png}} \frame{\frametitle{OS hints}\begin{itemize} \item SUIF tries to direct the OS's page allocation \item knows about each processor's working set and access patterns \item tries to make data physically contiguous \end{itemize}}\frame{\frametitle{SUIF 1 Passes}\begin{itemize} \item Constant Propagation (`porky -const-prop'). \item Scalarize Array Accesses (`porky -scalarize'). \item Forward Propagation (`porky -forward-prop'). \item Normalization (`predep -normalize'). \item Induction Variable Detection (`porky -ivar', `porky -know-bounds'). \item Constant Folding (`porky -fold'). \item Scalar Privatization Analysis (`moo -Psce'). \item Reduction Recognition (`reduction'). \item Dependence Preprocessing (`predep -presc'). \item Parallelism and Locality Optimizer (`skweel -T'). \item Parallel Code Generator (`pgen'). \end{itemize}}\frame{\frametitle{SUIF 1 Capabilities}\begin{itemize} \item preprocessing handles common nonlinear accesses \item loop transformations: interchange, reversal, skewing and tiling \item handles imperfect loop nests \section{SUIF Results} \end{itemize}}\frame{\frametitle{Results}\begin{itemize} \item use two benchmark suites \begin{itemize} \item SPECfp95 \item NAS \end{itemize} \end{itemize}}\frame{\frametitle{SPECfp95} \includegraphics[height=7cm]{spec.png}} \frame{\frametitle{NAS} \includegraphics[height=7cm]{nas.png}} \frame{\frametitle{Breakdown}\begin{itemize} \item Baseline: \begin{itemize} \item intraprocedural analysis \item scalar optimizations \end{itemize} \item Coarse grain: \begin{itemize} \item interprocedural analysis \item array privatization \end{itemize} \item Memory \begin{itemize} \item data partitioning and permutation \end{itemize} \end{itemize}} \frame{\frametitle{Breakdown} \includegraphics[width=9cm]{suifparts.png}} \frame{\frametitle{Performance}\begin{itemize} \item 13 of 18 programs show noticeable speedups \item of 5 failing cases 3 are attributed to fine grained parallelism \item only 2 are not parallelizable \end{itemize}}\frame{\frametitle{SUIF vs KAP}\begin{itemize} \item SUIF has performance results for SUIF1 vs KAP for {\tt Perfect} and {\tt NAS} benchmarks \end{itemize}} % \frame{\frametitle{SUIF vs KAP Perfect} % \begin{verbatim} % Perfect Parallelized Loops KAP loops % Club KAP SUIF By KAP SUIF found % total total Both only only by SUIF % ------------------------------------------------------ % ADM (APS) 141 134 132 9 2 94 \% % SPICE (CSS) 8 30 8 0 22 100 \% % QCD (LGS) 85 85 84 1 1 99 \% % MDG (LWS) 29 26 24 5 2 83 \% % TRACK (MTS) 48 48 48 0 0 100 \% % BDNA (NAS) 103 105 94 9 11 95 \% % OCEAN (OCS) 65 60 59 6 1 91 \% % DYFESM (SDS) 90 93 86 4 7 100 \% % MG3D (SMS) 38 33 33 5 0 87 \% % ARC2D (SRS) 136 136 132 4 4 100 \% % FLO52 (TFS) 64 64 64 0 0 100 \% % TRFD (TIS) 22 25 22 0 3 100 \% % SPEC77 (WSS) 202 195 193 9 2 96 \% % ------------------------------------------------------ % Total 1031 1034 979 52 55 96 \% % Average 79.3 79.5 75.3 4.0 4.2 96 \% % \end{verbatim} } % \frame{\frametitle{SUIF vs KAP NAS} % \begin{verbatim} % NAS Parallelized Loops KAP loops % Benchmarks KAP SUIF By KAP SUIF found % total total Both only only by SUIF % ------------------------------------------------------ % appbt 92 90 89 3 1 97 \% % applu 67 72 62 5 10 93 \% % appsp 80 87 74 6 13 93 \% % buk 2 4 2 0 2 100 \% % cgm 14 16 14 0 2 100 \% % embar 1 5 1 0 4 100 \% % fftpde 13 14 10 3 4 77 \% % mgrid 12 15 11 1 4 92 \% % ------------------------------------------------------ % Total 281 303 263 18 41 94 \% % Average 35.1 37.9 32.9 2.2 5.1 94 \% % \end{verbatim} } \section{Polaris Implementation} \frame{\frametitle{Polaris} Polaris: Automatic Parallelization of Conventional Fortran Programs } \frame{\frametitle{Analysis}\begin{itemize} \item Equality test \item GCD test \item Range test (nonlinear access) \end{itemize}}\frame{\frametitle{Inlining and Constant Propagation}\begin{itemize} \item inlining is guided by heuristics \item alternatively values from call sites can be propagated through the called function \item functions are cloned for specific values of the arguments \end{itemize}}\frame{\frametitle{Privatization}\begin{itemize} \item scalar privatization \item array privatization \begin{itemize} \item similar idea to scalar case \item if all accesses to elements within an iteration write first, then read the array can be privatized \end{itemize} \end{itemize}}\frame{\frametitle{Reduction}\begin{itemize} \item can identify reductions of the form: \\ \begin{tt} DO i=1,n \\ sum = sum + A(B(i)) \end{tt} \item selects between a locking strategy and dividing up the reduction across every processor \end{itemize}}\frame{\frametitle{Induction Variables}\begin{itemize} \item variables which can be derived from the loop iteration \\ \begin{tt} j=0 \\ DO i=1,n \\ j = j+2 \\ u(j) = j \end{tt} \item analysis of the loop requires j to be in terms of the loop index \item Polaris expands to handle the case where j is a function of more than one index \end{itemize}} \section{Polaris Results} \frame{\frametitle{Results}\begin{itemize} \item uses benchmarks from three sources \item SPECfp95 \item Perfect \item NCSA programs \end{itemize}}\frame{\frametitle{PFA vs Polaris} \includegraphics[height=7cm]{c1.png}} \frame{\frametitle{Polaris Components} \includegraphics[height=7cm]{c2.png}} \frame{\frametitle{Improvements}\begin{itemize} \item remove unnecessary analysis \item dependence and privatization could be determined at runtime \item more use of loop transformations \end{itemize}} \section{Thoughts} \frame{\frametitle{Polaris Philosophy}\begin{itemize} \item portability is important \item need to have compilers which make extensive analysis \item example: HPF's data directives require knowledge of the system \end{itemize}}\frame{\frametitle{Polaris Philosophy}\begin{itemize} \item need to test with real programs \item use analysis to direct compiler efforts \end{itemize}}\frame{\frametitle{Complete Automation}\begin{itemize} \item both compilers are fully automated \item HPF was useful and successful \item some recent projects suggest that full automation may not be necessary or desirable \end{itemize}} \frame{\frametitle{Complete Automation} \includegraphics[height=7cm]{mvsm.png}} \frame{\frametitle{Source to Source vs Complete Compilation}\begin{itemize} \item SUIF generates executable code \item Polaris mentions providing information through markup \item both note that extensive information is critical at several stages \end{itemize}}\frame{\frametitle{Where They Are Now: SUIF}\begin{itemize} \item SUIF2 was developed \item SUIF is generic enough to support research other than parallelism \item some recent work (1999,2000) looked into interactive parallelism \begin{itemize} \item Parascope was ahead of its time? \end{itemize} \end{itemize}}\frame{\frametitle{Where They Are Now: Polaris}\begin{itemize} \item on a page labelled {\tt polaris-old.html} \item Polaris research group launched \item group is looking at OpenMP in addition to some other automated compilers \item some later work seemed to be looking at runtime analysis \end{itemize}} \frame{\frametitle{SUIF vs Polaris}\begin{itemize} \item Polaris claims to deal with real programs \item optimizations seem very specific \item SUIF covers more traditional areas: loop and data optimizations \item many of these optimizations have proven themselves through HPF \end{itemize}} \end{document}