%%% -*-BibTeX-*-
%%% ====================================================================
%%% BibTeX-file{
%%% author = "Nelson H. F. Beebe",
%%% version = "1.03",
%%% date = "25 October 2010",
%%% time = "17:23:38 MDT",
%%% filename = "supercomputing2002.bib",
%%% address = "University of Utah
%%% Department of Mathematics, 110 LCB
%%% 155 S 1400 E RM 233
%%% Salt Lake City, UT 84112-0090
%%% USA",
%%% telephone = "+1 801 581 5254",
%%% FAX = "+1 801 581 4148",
%%% URL = "http://www.math.utah.edu/~beebe",
%%% checksum = "15932 2233 13182 122527",
%%% email = "beebe at math.utah.edu, beebe at acm.org,
%%% beebe at computer.org (Internet)",
%%% codetable = "ISO/ASCII",
%%% keywords = "BibTeX, bibliography, SC2002, Supercomputing
%%% 2002",
%%% license = "public domain",
%%% supported = "yes",
%%% docstring = "This is a complete bibliography of papers
%%% published in the proceedings of
%%% Supercomputing '2002.
%%%
%%% The conference World-Wide Web site is
%%%
%%% http://www.sc-2002.org/
%%%
%%% The organizers of this conference series
%%% maintain a World-Wide Web site at
%%%
%%% http://www.supercomp.org/
%%%
%%% where pointers to Web pages for the
%%% conferences from 1988 to date may be found.
%%%
%%% At version 1.03, the year coverage looked
%%% like this:
%%%
%%% 2002 ( 68)
%%%
%%% InProceedings: 67
%%% Proceedings: 1
%%%
%%% Total entries: 68
%%%
%%% In this bibliography, entries are sorted in
%%% order of PDF file numbers.
%%%
%%% The on-line electronic proceedings do not
%%% contain sequential page numbers, although
%%% there is an ISBN assigned for the
%%% proceedings. A pagecount field is given with
%%% each entry, extracted from the PDF file: some
%%% of the articles lack page numbers altogether,
%%% others number pages 1, 2, 3, ...
%%%
%%% The checksum field above contains a CRC-16
%%% checksum as the first value, followed by the
%%% equivalent of the standard UNIX wc (word
%%% count) utility output of lines, words, and
%%% characters. This is produced by Robert
%%% Solovay's checksum utility.",
%%% }
%%% ====================================================================
@Preamble{
"\ifx \undefined \TM \def \TM {${}^{\sc TM}$} \fi"
}
%%% ====================================================================
%%% Acknowledgement abbreviations:
@String{ack-nhfb = "Nelson H. F. Beebe,
University of Utah,
Department of Mathematics, 110 LCB,
155 S 1400 E RM 233,
Salt Lake City, UT 84112-0090, USA,
Tel: +1 801 581 5254,
FAX: +1 801 581 4148,
e-mail: \path|beebe@math.utah.edu|,
\path|beebe@acm.org|,
\path|beebe@computer.org| (Internet),
URL: \path|http://www.math.utah.edu/~beebe/|"}
%%% ====================================================================
%%% Publishers and their addresses:
@String{pub-IEEE = "IEEE Computer Society Press"}
@String{pub-IEEE:adr = "1109 Spring Street, Suite 300,
Silver Spring, MD 20910, USA"}
%%% ====================================================================
%%% Bibliography entries.
@InProceedings{DeRose:2002:SSI,
author = "Luiz DeRose and K. Ekanadham and Jeffrey Hollingsworth
and Simone Sbaraglia",
title = "{SIGMA}: {A} Simulator Infrastructure to Guide Memory
Analysis",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap191.pdf",
abstract = "In this paper we present SIGMA (Simulation
Infrastructure to Guide Memory Analysis), a new data
collection framework and family of cache analysis
tools. The SIGMA environment provides detailed cache
information by gathering memory reference data using
software-based instrumentation. This infrastructure can
facilitate quick probing into the factors that
influence the performance of an application by
highlighting bottleneck scenarios including: excessive
cache/TLB misses and inefficient data layouts. The tool
can also assist in perturbation analysis to determine
performance variations caused by changes to
architecture or program. Our validation tests using the
SPEC Swim benchmark show that most of the performance
metrics obtained with SIGMA are within 1\% of the
metrics obtained with hardware performance counters,
with the advantage that SIGMA provides performance data
on a data structure level, as specified by the
programmer.",
acknowledgement = ack-nhfb,
}
@InProceedings{Lu:2002:CAS,
author = "Charng-da Lu and Daniel A. Reed",
title = "Compact Application Signatures for Parallel and
Distributed Scientific Codes",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap244.pdf",
abstract = "Understanding the dynamic behavior of parallel
programs is key to developing efficient system software
and runtime environments; this is even more true on
emerging computational Grids where resource
availability and performance can change in
unpredictable ways. Event tracing provides details on
behavioral dynamics, albeit often at great cost. We
describe an intermediate approach, based on curve
fitting, that retains many of the advantages of event
tracing but with lower overhead. These compact
``application signatures'' summarize the time-varying
resource needs of scientific codes from historical
trace data. We also developed a comparison scheme that
measures similarity between two signatures, both across
executions and across execution environments.",
acknowledgement = ack-nhfb,
}
@InProceedings{Ahn:2002:SAT,
author = "Dong H. Ahn and Jeffrey S. Vetter",
title = "Scalable Analysis Techniques for Microprocessor
Performance Counter Metrics",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap257.pdf",
abstract = "Contemporary microprocessors provide a rich set of
integrated performance counters that allow application
developers and system architects alike the opportunity
to gather important information about workload
behaviors. Current techniques for analyzing data
produced from these counters use raw counts, ratios,
and visualization techniques to help users make decisions
about their application performance. While these
techniques are appropriate for analyzing data from one
process, they do not scale easily to new levels
demanded by contemporary computing systems. Very
simply, this paper addresses these concerns by
evaluating several multivariate statistical techniques
on these datasets. We find that several techniques,
such as statistical clustering, can automatically
extract important features from the data. These derived
results can, in turn, be fed directly back to an
application developer, or used as input to a more
comprehensive performance analysis environment, such as
a visualization or an expert system.",
acknowledgement = ack-nhfb,
}
@InProceedings{Bailey:2002:HPC,
author = "David H. Bailey and David Broadhurst and Yozo Hida and
Xiaoye S. Li and Brandon Thompson",
title = "High Performance Computing Meets Experimental
Mathematics",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Fri Aug 08 11:13:32 2008",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap124.pdf",
abstract = "In this paper we describe some novel applications of
high performance computing in a discipline now known as
experimental mathematics. The paper reviews some recent
published work, and then presents some new results that
have not yet appeared in the literature. A key
technique involved in this research is the PSLQ integer
relation algorithm (recently named one of ten
algorithms of the century by Computing in Science and
Engineering). This algorithm permits one to recognize a
numeric constant in terms of the formula that it
satisfies. We present a variant of PSLQ that is
well-suited for parallel computation, and give several
examples of new mathematical results that we have found
using it. Two of these computations were performed on
highly parallel computers, since they are not feasible
on conventional systems. We also describe a new
software package for performing arbitrary precision
arithmetic, which is required in this research.",
acknowledgement = ack-nhfb,
}
@InProceedings{Baumgartner:2002:HLA,
author = "Gerald Baumgartner and David E. Bernholdt and Daniel
Cociorva and Robert Harrison and So Hirata and
Chi-Chung Lam and Marcel Nooijen and Russell Pitzer and
J. Ramanujam and P. Sadayappan",
title = "A High-Level Approach to Synthesis of High-Performance
Codes for Quantum Chemistry",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap306.pdf",
abstract = "This paper discusses an approach to the synthesis of
high-performance parallel programs for a class of
computations encountered in quantum chemistry and
physics. These computations are expressible as a set of
tensor contractions and arise in electronic structure
modeling. An overview is provided of the synthesis
system, that transforms a high-level specification of
the computation into high-performance parallel code,
tailored to the characteristics of the target
architecture. An example from computational chemistry
is used to illustrate how different code structures are
generated under different assumptions of available
memory on the target computer.",
acknowledgement = ack-nhfb,
}
@InProceedings{Ding:2002:MOP,
 author = "Yun He and Chris H. Q. Ding",
 title = "{MPI} and {OpenMP} Paradigms on Cluster of {SMP}
 Architectures",
 crossref = "IEEE:2002:STI",
 pages = "??--??",
 year = "2002",
 bibdate = "Wed Nov 26 07:34:20 2003",
 URL = "http://www.sc-2002.org/paperpdfs/pap.pap325.pdf",
 abstract = "We investigate remapping multi-dimensional arrays on
 cluster of SMP architectures under OpenMP, MPI, and
 hybrid paradigms. Traditional method of array transpose
 needs an auxiliary array of the same size and a copy
 back stage. We recently developed an in-place method
 using vacancy tracking cycles. The vacancy tracking
 algorithm outperforms the traditional 2-array method as
 demonstrated by extensive comparisons. The independence
 of vacancy tracking cycles allows efficient
 parallelization of the in-place method on SMP
 architectures at node level. Performance of
 multi-threaded parallelism using OpenMP are tested with
 different scheduling methods and different number of
 threads. The vacancy tracking method is parallelized
 using several parallel paradigms. At node level, pure
 OpenMP outperforms pure MPI by a factor of 2.76. Across
 entire cluster of SMP nodes, the hybrid MPI/OpenMP
 implementation outperforms pure MPI by a factor of
 4.44, demonstrating the validity of the parallel
 paradigm of mixing MPI with OpenMP.",
 acknowledgement = ack-nhfb,
 keywords = "multidimensional arrays; index reshuffle; vacancy
 tracking cycles; global exchange; dynamical remapping;
 MPI; OpenMP; hybrid MPI/OpenMP; SMP cluster",
}
@InProceedings{Hacker:2002:ESP,
author = "Thomas J. Hacker and Brian D. Noble and Brian D.
Athey",
title = "The Effects of Systemic Packet Loss on Aggregate {TCP}
Flows",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap270.pdf",
abstract = "The use of parallel TCP connections to increase
throughput for bulk transfers is common practice within
the high performance computing community. However, the
effectiveness, fairness, and efficiency of data
transfers across parallel connections is unclear. This
paper considers the impact of systemic non-congestion
related packet loss on the effectiveness, fairness, and
efficiency of parallel TCP transmissions. The results
indicate that parallel connections are effective at
increasing aggregate throughput, and increase the
overall efficiency of the network bottleneck. In the
presence of congestion related losses, parallel flows
steal bandwidth from other single stream flows. A
simple modification is presented that reduces the
fairness problems when congestion is present, but
retains effectiveness and efficiency.",
acknowledgement = ack-nhfb,
}
@InProceedings{Pradhan:2002:IEQ,
author = "Prashant Pradhan and Tzi-cker Chiueh",
title = "Implementation and Evaluation of a {QoS}-Capable
Cluster-Based {IP} Router",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap320.pdf",
abstract = "A major challenge in Internet edge router design is to
support both high packet forwarding performance and
versatile and efficient packet processing capabilities.
The thesis of this research project is that a cluster
of PCs connected by a high-speed system area network
provides an effective hardware platform for building
routers to be used at the edges of the Internet. This
paper describes a scalable and extensible edge router
architecture called Panama, which supports a novel
aggregate route caching scheme, a real-time link
scheduling algorithm whose performance overhead is
independent of the number of real-time flows, a highly
efficient kernel extension mechanism to safely load
networking software extensions dynamically, and an
integrated resource scheduler which ensures that
real-time flows with additional packet processing
requirements still meet their end-to-end performance
requirements. This paper describes the implementation
and evaluation of the first Panama prototype based on a
cluster of PCs and Myrinet.",
acknowledgement = ack-nhfb,
}
@InProceedings{Dunigan:2002:TTD,
author = "Tom Dunigan and Matt Mathis and Brian Tierney",
title = "A {TCP} Tuning Daemon",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap151.pdf",
abstract = "Many high performance distributed applications require
high network throughput but are able to achieve only a
small fraction of the available bandwidth. A common
cause of this problem is improperly tuned network
settings. Tuning techniques, such as setting the
correct TCP buffers and using parallel streams, are
well known in the networking community, but outside the
networking community they are infrequently applied. In
this paper, we describe a tuning daemon that uses TCP
instrumentation data from the Unix kernel to
transparently tune TCP parameters for specified
individual flows over designated paths. No
modifications are required to the application, and the
user does not need to understand network or TCP
characteristics.",
acknowledgement = ack-nhfb,
keywords = "autotuning; TCP; high-performance networking; data
grids",
}
@InProceedings{Malard:2002:DDH,
author = "J. M. Malard and R. D. Stewart",
title = "Distributed Dynamic Hash Tables Using {IBM LAPI}",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap247.pdf",
abstract = "An asynchronous communication library for accessing
and managing dynamic hash tables over a network of
Symmetric Multiprocessors (SMP) is presented. A
blocking factor is shown experimentally to reduce the
variance of the wall clock time. It is also shown that
remote accesses to a distributed hash table can be as
effective and scalable as the one-sided operations of
the low-level communication middleware on an IBM SP.",
acknowledgement = ack-nhfb,
}
@InProceedings{Swany:2002:MRP,
author = "Martin Swany and Rich Wolski",
title = "Multivariate Resource Performance Forecasting in the
{Network Weather Service}",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap292.pdf",
abstract = "This paper describes a new technique in the Network
Weather Service for producing multi-variate forecasts.
The new technique uses the NWS's univariate forecasters
and empirically gathered Cumulative Distribution
Functions (CDFs) to make predictions from correlated
measurement streams. Experimental results are shown in
which throughput is predicted for long TCP/IP transfers
from short NWS network probes.",
acknowledgement = ack-nhfb,
}
@InProceedings{Otoo:2002:DCR,
author = "Ekow J. Otoo and Frank Olken and Arie Shoshani",
title = "Disk Cache Replacement Algorithm for Storage Resource
Managers in Data Grids",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap322.pdf",
abstract = "We address the problem of cache replacement policies
for Storage Resource Managers (SRMs) that are used in
Data Grids. An SRM has a disk storage of bounded
capacity that retains some N objects. A replacement
policy is applied to determine which object in the
cache needs to be evicted when space is needed. We
define a utility function for ranking the candidate
objects for eviction and then describe an efficient
algorithm for computing the replacement policy based on
this function. This computation takes time $O(\log N)$.
We compare our policy with traditional replacement
policies such as Least Frequently Used (LFU), Least
Recently Used (LRU), LRU-K, Greedy Dual Size (GDS),
etc., using simulations of both synthetic and real
workloads of file accesses to tertiary storage. Our
simulations of replacement policies account for delays
in cache space reservation, data transfer and
processing. The results obtained show that our proposed
method is the most cost effective cache replacement
policy for Storage Resource Managers (SRM).",
acknowledgement = ack-nhfb,
keywords = "file caching; cache replacement algorithm;
trace-driven simulation; data staging; storage resource
management",
}
@InProceedings{Radovic:2002:ESN,
author = "Zoran Radovic and Erik Hagersten",
title = "Efficient Synchronization for Nonuniform Communication
Architectures",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap221.pdf",
abstract = "Scalable parallel computers are often nonuniform
communication architectures (NUCAs), where the access
time to other processor's caches vary with their
physical location. Still, few attempts of exploring
cache-to-cache communication locality have been made.
This paper introduces a new kind of synchronization
primitives (lock-unlock) that favor neighboring
processors when a lock is released. This improves the
lock handover time as well as access time to the shared
data of the critical region. A critical section guarded
by our new RH lock takes less than half the time to
execute compared with the same critical section guarded
by any other lock on our NUCA hardware. The execution
time for Raytrace with 28 processors was improved
2.23--4.68 times, while global traffic was dramatically
decreased compared with all the other locks. The
average execution time was improved 7--24\% while the
global traffic was decreased 8--28\% for an average
over the seven applications studied.",
acknowledgement = ack-nhfb,
}
@InProceedings{Sistare:2002:UHP,
author = "Steven J. Sistare and Christopher J. Jackson",
title = "Ultra-High Performance Communication with {MPI} and
the {Sun Fire(\TM)} Link Interconnect",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap142.pdf",
abstract = "We present a new low-latency system area network that
provides the ultra-high bandwidth needed to fuse a
collection of large SMP servers into a capability
cluster. The network adapter exports a remote shared
memory (RSM) model that supports low latency kernel
bypass messaging. The Sun\TM{} MPI library uses the RSM
interface to implement a highly efficient
memory-to-memory messaging protocol in which the
library directly manages buffers and data structures in
remote memory. This allows flexible allocation of
buffer space to active connections, while avoiding
resource contention that could otherwise increase
latencies. We discuss the characteristics of the
interconnect, describe the MPI protocols, and measure
the performance of a number of MPI benchmarks. Our
results include MPI inter-node bandwidths of almost 3
Gigabytes per second and MPI ping-pong latencies as low
as 3.7 microseconds.",
acknowledgement = ack-nhfb,
keywords = "interconnects; MPI; kernel bypass; remote shared
memory; SAN; performance evaluation",
}
@InProceedings{Eberle:2002:SHB,
author = "Hans Eberle and Nils Gura",
title = "Separated High-bandwidth and Low-latency Communication
in the Cluster Interconnect {Clint}",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap259.pdf",
abstract = "An interconnect for a high-performance cluster has to
be optimized in respect to both high throughput and low
latency. To avoid the tradeoff between throughput and
latency, the cluster interconnect Clint has a
segregated architecture that provides two physically
separate transmission channels: A bulk channel
optimized for high-bandwidth traffic and a quick
channel optimized for low-latency traffic. Different
scheduling strategies are applied. The bulk channel
uses a scheduler that globally allocates time slots on
the transmission paths before packets are sent off.
This way collisions as well as blockages are avoided.
In contrast, the quick channel takes a best-effort
approach by sending packets whenever they are available
thereby risking collisions and
retransmissions.\par
Simulation results clearly show the performance
advantages of the segregated architecture. The
carefully scheduled bulk channel can be loaded nearly
to its full capacity without exhibiting head-of-line
blocking that limits many networks while the quick
channel provides low-latency communication even in the
presence of high-bandwidth traffic.",
acknowledgement = ack-nhfb,
}
@InProceedings{Vetter:2002:EPE,
author = "Jeffrey S. Vetter and Andy Yoo",
title = "An Empirical Performance Evaluation of Scalable
Scientific Applications",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap222.pdf",
abstract = "We investigate the scalability, architectural
requirements, and performance characteristics of eight
scalable scientific applications. Our analysis is
driven by empirical measurements using statistical and
tracing instrumentation for both communication and
computation. Based on these measurements, we refine our
analysis into precise explanations of the factors that
influence performance and scalability for each
application; we distill these factors into common
traits and overall recommendations for both users and
designers of scalable platforms. Our experiments
demonstrate that some traits, such as improvements in
the scaling and performance of MPI's collective
operations, will benefit most applications. We also
find specific characteristics of some applications that
limit performance. For example, one application's
intensive use of a 64-bit, floating-point divide
instruction, which has high latency and is not
pipelined on the POWER3, limits the performance of the
application's primary computation.",
acknowledgement = ack-nhfb,
}
@InProceedings{El-Ghazawi:2002:UPP,
author = "Tarek El-Ghazawi and Fran{\c{c}}ois Cantonnet",
title = "{UPC} Performance and Potential: {A} {NPB}
Experimental Study",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap316.pdf",
abstract = "UPC, or Unified Parallel C, is a parallel extension of
ANSI C. UPC follows a distributed shared memory
programming model aimed at leveraging the ease of
programming of the shared memory paradigm, while
enabling the exploitation of data locality. UPC
incorporates constructs that allow placing data near
the threads that manipulate them to minimize remote
accesses. This paper gives an overview of the concepts
and features of UPC and establishes, through extensive
performance measurements of NPB workloads, the
viability of the UPC programming language compared to
the other popular paradigms. Further, through
performance measurements we identify the challenges,
the remaining steps and the priorities for UPC. It will
be shown that with proper hand tuning and optimized
collective operations libraries, UPC performance will
be comparable to that of MPI. Furthermore, by
incorporating such improvements into automatic compiler
optimizations, UPC will compare quite favorably to
message passing in ease of programming.",
acknowledgement = ack-nhfb,
keywords = "NPB (NAS Parallel Benchmark)",
}
@InProceedings{Worley:2002:SUC,
author = "Patrick H. Worley",
title = "Scaling the Unscalable: {A} Case Study on the
{AlphaServer SC}",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap155.pdf",
abstract = "A case study of the optimization of a climate modeling
application on the Compaq AlphaServer SC at the
Pittsburgh Supercomputer Center is used to illustrate
tools and techniques that are important to achieving
good performance scaling.",
acknowledgement = ack-nhfb,
}
@InProceedings{Schussman:2002:AVT,
author = "Greg Schussman and Brett Wilson and Kwok Ko and Ji
Qiang and Robert Ryne and Kwan-Liu Ma",
title = "Advanced Visualization Technology for Terascale
Particle Accelerator Simulations",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap224.pdf",
abstract = "This paper presents two new hardware-assisted
rendering techniques developed for interactive
visualization of the terascale data generated from
numerical modeling of next-generation accelerator
designs. The first technique, based on a hybrid
rendering approach, makes possible interactive
exploration of large-scale particle data from particle
beam dynamics modeling. The second technique, based on
a compact texture-enhanced representation, exploits the
advanced features of commodity graphics cards to
achieve perceptually effective visualization of the
very dense and complex electromagnetic fields produced
from the modeling of reflection and transmission
properties of open structures in an accelerator design.
Because of the collaborative nature of the overall
accelerator modeling project, the visualization
technology developed is for both desktop and remote
visualization settings. We have tested the techniques
using both time-varying particle data sets containing
up to one billion particles per time step and
electromagnetic field data sets with millions of mesh
elements.",
acknowledgement = ack-nhfb,
keywords = "hardware-assisted techniques; high-performance
computing; particle accelerators; perception;
point-based rendering; scientific visualization; field
lines; texture mapping; time-varying data; vector field
visualization; visual cues; volume rendering",
}
@InProceedings{Wolf:2002:SPS,
author = "Matthew Wolf and Zhongtang Cai and Weiyun Huang and
Karsten Schwan",
title = "{SmartPointers}: Personalized Scientific Data Portals
In Your Hand",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap304.pdf",
abstract = "The SmartPointer system provides a paradigm for
utilizing multiple light-weight client endpoints in a
real-time scientific visualization infrastructure.
Together, the client and server infrastructure form a
new type of data portal for scientific computing. The
clients can be used to personalize data for the needs
of the individual scientist. This personalization of a
shared dataset is designed to allow multiple
scientists, each with their laptops or iPaqs to explore
the dataset from different angles and with different
personalized filters. As an example, iPaq clients can
display 2D derived data functions which can be used to
dynamically update and annotate the shared data space,
which might be visualized separately on a large
immersive display such as a CAVE. Measurements are
presented for such a system, built upon the ECho
middleware system developed at Georgia Tech.",
acknowledgement = ack-nhfb,
}
@InProceedings{Snavely:2002:FPM,
author = "Allan Snavely and Laura Carrington and Nicole Wolter
and Jesus Labarta and Rosa Badia and Avi Purkayastha",
title = "A Framework for Performance Modeling and Prediction",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap201.pdf",
abstract = "Cycle-accurate simulation is far too slow for modeling
the expected performance of full parallel applications
on large HPC systems. And just running an application
on a system and observing wallclock time tells you
nothing about why the application performs as it does
(and is anyway impossible on yet-to-be-built systems).
Here we present a framework for performance modeling
and prediction that is faster than cycle-accurate
simulation, more informative than simple benchmarking,
and is shown useful for performance investigations in
several dimensions.",
acknowledgement = ack-nhfb,
}
@InProceedings{Gopalan:2002:IRL,
author = "Kartik Gopalan and Tzi-cker Chiueh",
title = "Improving Route Lookup Performance Using Network
Processor Cache",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap272.pdf",
abstract = "Earlier research has shown that the route lookup
performance of a network processor can be significantly
improved by caching ranges of lookup/classification
keys rather than individual keys. While the previous
work focused specifically on reducing capacity misses,
we address two other important aspects --- (a) reducing
conflict misses and (b) cache consistency during
frequent route updates. We propose two techniques to
minimize conflict misses that aim to balance the number
of cacheable entries mapped to each cache set. They
offer different tradeoffs between performance and
simplicity while improving the average route lookup
time by 76\% and 45.2\% respectively. To maintain cache
consistency during frequent route updates, we propose a
selective cache invalidation technique that can limit
the degradation in lookup latency to within 10.2\%. Our
results indicate potentially large improvement in
lookup performance for network processors used at
Internet edge and motivate further research into
caching at the Internet core.",
acknowledgement = ack-nhfb,
}
@InProceedings{Athanasaki:2002:PST,
author = "Maria Athanasaki and Aristidis Sotiropoulos and
Georgios Tsoukalas and Nectarios Koziris",
title = "Pipelined Scheduling of Tiled Nested Loops onto
Clusters of {SMP}s using Memory Mapped Network
Interfaces",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap132.pdf",
abstract = "This paper describes the performance benefits attained
using enhanced network interfaces to achieve low
latency communication. We present a novel, pipelined
scheduling approach which takes advantage of DMA
communication mode, to send data to other nodes, while
the CPUs are performing calculations. We also use
zero-copy communication through pinned-down physical
memory regions, provided by NIC's driver modules. Our
testbed concerns the parallel execution of tiled nested
loops onto a cluster of SMP nodes with single PCI-SCI
NICs inside each node. In order to schedule tiles, we
apply a hyperplane-based grouping transformation to the
tiled space, so as to group together independent
neighboring tiles and assign them to the same SMP node.
Experimental evaluation illustrates that memory mapped
NICs with enhanced communication features enable the
use of a more advanced pipelined (overlapping)
schedule, which considerably improves performance,
compared to an ordinary blocking schedule, implemented
with conventional, CPU and kernel bounded,
communication primitives.",
acknowledgement = ack-nhfb,
keywords = "memory mapped network interfaces; DMA; pipelined
schedules; tile grouping; communication overlapping;
SMPs",
}
@InProceedings{Hiraki:2002:DRU,
author = "Kei Hiraki and Mary Inaba and Junji Tamatsukuri and
Ryutaro Kurusu and Yukichi Ikuta and Hisashi Koga and
Akira Zinzaki",
title = "Data Reservoir: Utilization of Multi-Gigabit Backbone
Network for Data-Intensive Research",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap327.pdf",
abstract = "We propose data sharing facility for data intensive
scientific research, ``Data Reservoir''; which is
optimized to transfer huge amount of data files between
distant places fully utilizing multi-gigabit backbone
network. In addition, ``Data Reservoir'' can be used as
an ordinary UNIX server in local network without any
modification of server software. We use low-level
protocol and hierarchical striping to realize (1)
separation of bulk data transfer and local accesses by
caching, (2) file-system transparency, i.e.,
interoperable whatever in higher layer than disk
driver, including file system. (3) scalability for
network and storage. This paper shows our design,
implementation using iSCSI protocol [1] and their
performances for both 1Gbps model in the real network
and 10Gbps model in our laboratory.",
acknowledgement = ack-nhfb,
}
@InProceedings{Li:2002:NSA,
author = "Laura Grigori and Xiaoye S. Li",
title = "A New Scheduling Algorithm For Parallel Sparse {LU}
Factorization with Static Pivoting",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap139.pdf",
abstract = "In this paper we present a static scheduling algorithm
for parallel sparse LU factorization with static
pivoting. The algorithm is divided into mapping and
scheduling phases, using the symmetric pruned graphs of
$L^T$ and $U$ to represent dependencies. The scheduling
algorithm is designed for driving the parallel
execution of the factorization on a distributed-memory
architecture. Experimental results and comparisons with
SuperLU DIST are reported after applying this algorithm
on real world application matrices on an IBM SP RS/6000
distributed memory machine.",
acknowledgement = ack-nhfb,
}
@InProceedings{Vuduc:2002:POB,
author = "Richard Vuduc and James W. Demmel and Katherine A.
Yelick and Shoaib Kamil and Rajesh Nishtala and
Benjamin Lee",
title = "Performance Optimizations and Bounds for Sparse
Matrix-Vector Multiply",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap317.pdf",
abstract = "We consider performance tuning, by code and data
structure reorganization, of sparse matrix-vector
multiply (SpMxV), one of the most important
computational kernels in scientific applications. This
paper addresses the fundamental questions of what
limits exist on such performance tuning, and how
closely tuned code approaches these limits.
Specifically, we develop upper and lower bounds on the
performance (Mflop/s) of SpMxV when tuned using our
previously proposed register blocking optimization.
These bounds are based on the non-zero pattern in the
matrix and the cost of basic memory operations, such as
cache hits and misses. We evaluate our tuned
implementations with respect to these bounds using
hardware counter data on 4 different platforms and on a
test set of 44 sparse matrices. We find that we can
often get within 20\% of the upper bound, particularly
on a class of matrices from finite element modeling
(FEM) problems; on non-FEM matrices, performance
improvements of $2\times$ are still possible. Lastly,
we present a new heuristic that selects optimal or
near-optimal register block sizes (the key tuning
parameters) more accurately than our previous
heuristic. Using the new heuristic, we show
improvements in SpMxV performance (Mflop/s) by as much
as $2.5\times$ over an untuned implementation.
Collectively, our results suggest that future
performance improvements, beyond those that we have
already demonstrated for SpMxV, will come from two
sources: (1) consideration of higher-level matrix
structures (e.g., exploiting symmetry, matrix
reordering, multiple register block sizes), and (2)
optimizing kernels with more opportunity for data reuse
(e.g., sparse matrix-multiple vector multiply,
multiplication of $A^T A$ by a vector).",
acknowledgement = ack-nhfb,
}
@InProceedings{Teranishi:2002:NDM,
author = "Keita Teranishi and Padma Raghavan and Esmond Ng",
title = "A New Data-Mapping Scheme For Latency-Tolerant
Distributed Sparse Triangular Solution",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap238.pdf",
abstract = "This paper concerns latency-tolerant schemes for the
efficient parallel solution of sparse triangular linear
systems on distributed memory multiprocessors. Such
triangular solution is required when sparse Cholesky
factors are used to solve for a sequence of
right-hand-side vectors or when incomplete sparse
Cholesky factors are used to precondition a Conjugate
Gradients iterative solver. In such applications, the
use of traditional distributed substitution schemes can
create a performance bottleneck when the latency of
interprocessor communication is large. We had earlier
developed the Selective Inversion (SI) scheme to reduce
communication latency costs by replacing distributed
substitution by parallel matrix vector multiplication.
We now present a new two-way mapping of the triangular
sparse matrix to processors to improve the performance
of SI by halving its communication latency costs. We
provide analytic results for model sparse matrices and
we report on the performance of our scheme for parallel
preconditioning with incomplete sparse Cholesky
factors.",
acknowledgement = ack-nhfb,
}
@InProceedings{Traff:2002:IMP,
author = "Jesper Larsson Tr{\"a}ff",
title = "Implementing the {MPI} Process Topology Mechanism",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap122.pdf",
abstract = "The topology functionality of the Message Passing
Interface (MPI) provides a portable,
architecture-independent means for adapting application
programs to the communication architecture of the
target hardware. However, current MPI implementations
rarely go beyond the most trivial implementation, and
simply perform no process remapping. We discuss the
potential of the topology mechanism for systems with a
hierarchical communication architecture like clusters
of SMP nodes. The MPI topology functionality is a weak
mechanism, and we argue about some of its shortcomings.
We formulate the topology optimization problem as a
graph embedding problem, and show that for hierarchical
systems it can be solved by graph partitioning. We
state the properties of a new heuristic for solving
both the embedding problem and the ``easier'' graph
partitioning problem. The graph partitioning based
framework has been fully implemented in MPI/SX for the
NEC SX-series of parallel vector computers. MPI/SX is
thus one of very few MPI implementations with a
non-trivial topology functionality. On a 4 node NEC
SX-6 significant communication performance improvements
are achieved with synthetic MPI benchmarks.",
acknowledgement = ack-nhfb,
}
@InProceedings{Bosilca:2002:MVT,
author = "George Bosilca and Aurelien Bouteiller and Franck
Cappello and Samir Djilali and Gilles Fedak and Cecile
Germain and Thomas Herault and Pierre Lemarinier and
Oleg Lodygensky and Frederic Magniette and Vincent Neri
and Anton Selikhov",
title = "{MPICH-V}: Toward a Scalable Fault Tolerant {MPI} for
Volatile Nodes",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap298.pdf",
abstract = "Global Computing platforms, large scale clusters and
future TeraGRID systems gather thousands of nodes for
computing parallel scientific applications. At this
scale, node failures or disconnections are frequent
events. This Volatility reduces the MTBF of the whole
system in the range of hours or minutes. We present
MPICH-V, an automatic Volatility tolerant MPI
environment based on uncoordinated checkpoint/ rollback
and distributed message logging. MPICH-V architecture
relies on Channel Memories, Checkpoint servers and
theoretically proven protocols to execute existing or
new, SPMD and Master-Worker MPI applications on
volatile nodes. To evaluate its capabilities, we run
MPICH-V within a framework for which the number of
nodes, Channels Memories and Checkpoint Servers can be
completely configured as well as the node Volatility.
We present a detailed performance evaluation of every
component of MPICH-V and its global performance for
non-trivial parallel applications. Experimental results
demonstrate good scalability and high tolerance to node
volatility.",
acknowledgement = ack-nhfb,
}
@InProceedings{Chiu:2002:PMM,
author = "Kenneth Chiu and Madhusudhan Govindaraju and Dennis
Gannon",
title = "The {Proteus Multiprotocol Message Library}",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap315.pdf",
abstract = "Grid systems span manifold organizations and
application domains. Because this diverse environment
inevitably engenders multiple protocols,
interoperability mechanisms are crucial to seamless,
pervasive access. This paper presents the design,
rationale, and implementation of the Proteus
multiprotocol library for integrating multiple message
protocols, such as SOAP and JMS, within one system.
Proteus decouples application code from protocol code
at run-time, allowing clients to incorporate separately
developed protocols without recompiling or halting.
Through generic serialization, which separates the
transfer syntax from the message type, protocols can
also be added independently of serialization routines.
We also show performance-enhancing mechanisms for Grid
services that examine metadata, but pass actual data
through opaquely (such as adapters). The interface
provided to protocol implementors is general enough to
support protocols as disparate as our current
implementations: SOAP, JMS, and binary. Proteus is
written in C++; a Java port is planned.",
acknowledgement = ack-nhfb,
}
@InProceedings{Parello:2002:IAA,
author = "David Parello and Olivier Temam and Jean-Marie
Verdun",
title = "On Increasing Architecture Awareness in Program
Optimizations to Bridge the Gap between Peak and
Sustained Processor Performance -- Matrix-Multiply
Revisited",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap107.pdf",
abstract = "As the complexity of processor architectures
increases, there is a widening gap between peak
processor performance and sustained processor
performance so that programs now tend to exploit only a
fraction of available performance. While there is a
tremendous amount of literature on program
optimizations, compiler optimizations lack efficiency
because they are plagued by three flaws: (1) they often
implicitly use simplified, if not simplistic, models of
processor architecture, (2) they usually focus on a
single processor component (e.g., cache) and ignore the
interactions among multiple components, (3) the most
heavily investigated components (e.g., caches)
sometimes have only a small impact on overall
performance. Through the in-depth analysis of a simple
program kernel, we want to show that understanding the
complex interactions between programs and the numerous
processor architecture components is both feasible and
critical to design efficient program optimizations.",
acknowledgement = ack-nhfb,
}
@InProceedings{Pike:2002:BTA,
author = "Geoff Pike and Paul N. Hilfinger",
title = "Better Tiling and Array Contraction for Compiling
Scientific Programs",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap260.pdf",
abstract = "Scientific programs often include multiple loops over
the same data; interleaving parts of different loops
may greatly improve performance. We exploit this in a
compiler for Titanium, a dialect of Java. Our compiler
combines reordering optimizations such as loop fusion
and tiling with storage optimizations such as array
contraction (eliminating or reducing the size of
temporary arrays). The programmers we have in mind are
willing to spend some time tuning their code and their
compiler parameters. Given that, and the difficulty in
statically selecting parameters such as tile sizes, it
makes sense to provide automatic parameter searching
alongside the compiler. Our strategy is to optimize
aggressively but to expose the compiler's decisions to
external control. We double or triple the performance
of Gauss--Seidel relaxation and multigrid (versus an
optimizing compiler without tiling and array
contraction), and we argue that ours is the best
compiler for that kind of program.",
acknowledgement = ack-nhfb,
}
@InProceedings{Vetter:2002:APE,
author = "Jeffrey S. Vetter and Patrick H. Worley",
title = "Asserting Performance Expectations",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap275.pdf",
abstract = "Traditional techniques for performance analysis
provide a means for extracting and analyzing raw
performance information from applications. Users then
compare this raw data to their performance expectations
for application constructs. This comparison can be
tedious for the scale of today's architectures and
software systems. To address this situation, we present
a methodology and prototype that allows users to assert
performance expectations explicitly in their source
code using performance assertions. As the application
executes, each performance assertion in the application
collects data implicitly to verify the assertion. By
allowing the user to specify a performance expectation
with individual code segments, the runtime system can
jettison raw data for measurements that pass their
expectation, while reacting to failures with a variety
of responses. We present several compelling uses of
performance assertions with our operational prototype,
including raising a performance exception, validating a
performance model, and adapting an algorithm
empirically at runtime.",
acknowledgement = ack-nhfb,
}
@InProceedings{Makino:2002:TSP,
author = "Junichiro Makino and Eiichiro Kokubo and Toshiyuki
Fukushige and Hiroshi Daisaka",
title = "A {29.5 Tflops} simulation of planetesimals in
{Uranus-Neptune} region on {GRAPE-6}",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap146.pdf",
abstract = "As an entry for the 2002 Gordon Bell performance
prize, we report the performance achieved on the
GRAPE-6 system for a simulation of the early evolution
of the protoplanet-planetesimal system of the
Uranus-Neptune region. GRAPE-6 is a special-purpose
computer for astrophysical N-body calculations. The
present configuration has 2048 custom pipeline chips,
each containing six pipeline processors for the
calculation of gravitational interactions between
particles. Its theoretical peak performance is 63.4
Tflops. The actual performance obtained was 29.5
Tflops, for a simulation of the early evolution of
outer Solar system with 1.8 million planetesimals and
two massive protoplanets.",
acknowledgement = ack-nhfb,
}
@InProceedings{Bhardwaj:2002:SSS,
author = "Manoj Bhardwaj and Kendall Pierson and Garth Reese and
Tim Walsh and David Day and Ken Alvin and James Peery
and Charbel Farhat and Michel Lesoinne",
title = "{Salinas}: {A} Scalable Software for High-Performance
Structural and Solid Mechanics Simulations",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap216.pdf",
abstract = "We present Salinas, a scalable implicit software
application for the finite element static and dynamic
analysis of complex structural real-world systems. This
relatively complete engineering software with more than
100,000 lines of code and a long list of users sustains
292.5 Gflop/s on 2,940 ASCI Red processors,
and 1.16 Tflop/s on 3,375 ASCI White processors.",
acknowledgement = ack-nhfb,
}
@InProceedings{Phillips:2002:NBS,
author = "James C. Phillips and Gengbin Zheng and Sameer Kumar
and Laxmikant V. Kal{\'e}",
title = "{NAMD}: Biomolecular Simulation on Thousands of
Processors",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap277.pdf",
abstract = "NAMD is a fully featured, production molecular
dynamics program for high performance simulation of
large biomolecular systems. We have previously, at
SC2000, presented scaling results for simulations with
cutoff electrostatics on up to 2048 processors of the
ASCI Red machine, achieved with an object-based hybrid
force and spatial decomposition scheme and an
aggressive measurement-based predictive load balancing
framework. We extend this work by demonstrating similar
scaling on the much faster processors of the PSC
Lemieux Alpha cluster, and for simulations employing
efficient (order N log N) particle mesh Ewald full
electrostatics. This unprecedented scalability in a
biomolecular simulation code has been attained through
latency tolerance, adaptation to multiprocessor nodes,
and the direct use of the Quadrics Elan library in
place of MPI by the Charm++/Converse parallel runtime
system.",
acknowledgement = ack-nhfb,
}
@InProceedings{Lee:2002:IOG,
author = "William Lee and Anthony Mayer and Steven Newhouse",
title = "{ICENI}: An {Open Grid Service Architecture}
Implemented with {Jini}",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap253.pdf",
abstract = "The move towards Service Grids, where services are
composed to meet the requirements of a user community
within constraints specified by the resource provider,
presents many challenges to service provision and
description. To support our research activities in the
autonomous composition of services to form a Semantic
Service Grid we describe the adoption within ICENI of
web services to enable interoperability with the
recently proposed Open Grid Services Architecture.",
acknowledgement = ack-nhfb,
keywords = "Computational Grids; Web Services; Semantic Grid",
}
@InProceedings{Hoschek:2002:WSD,
author = "Wolfgang Hoschek",
title = "The {Web Service Discovery Architecture}",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap161.pdf",
abstract = "In this paper, we propose the Web Service Discovery
Architecture (WSDA). At runtime, Grid applications can
use this architecture to discover and adapt to remote
services. WSDA promotes an interoperable web service
discovery layer by defining appropriate services,
interfaces, operations and protocol bindings, based on
industry standards. It is unified because it subsumes
an array of disparate concepts, interfaces and
protocols under a single semi-transparent umbrella. It
is modular because it defines a small set of orthogonal
multipurpose communication primitives (building blocks)
for discovery. These primitives cover service
identification, service description retrieval, data
publication as well as minimal and powerful query
support. The architecture is open and flexible because
each primitive can be used, implemented, customized and
extended in many ways. It is powerful because the
individual primitives can be combined and plugged
together by specific clients and services to yield a
wide range of behaviors and emerging synergies.",
acknowledgement = ack-nhfb,
keywords = "WSDA (Web Service Discovery Architecture)",
}
@InProceedings{Pierce:2002:IWS,
author = "Marlon Pierce and Geoffrey Fox and Choonhan Youn and
Steve Mock and Kurt Mueller and Ozgur Balsoy",
title = "Interoperable {Web} Services for Computational
Portals",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap284.pdf",
abstract = "Computational web portals are designed to simplify
access to diverse sets of high performance computing
resources, typically through an interface to
computational Grid tools. An important shortcoming of
these portals is their lack of interoperable and
reusable services. This paper presents an overview of
research efforts undertaken by our group to build
interoperating portal services around a Web Services
model. We present a comprehensive view of an
interoperable portal architecture, beginning with core
portal services that can be used to build Application
Web Services, which in turn may be aggregated and
managed through portlet containers.",
acknowledgement = ack-nhfb,
}
@InProceedings{Stamatakis:2002:APM,
author = "Alexandros P. Stamatakis and Thomas Ludwig and Harald
Meier and Marty J. Wolf",
title = "Accelerating Parallel Maximum Likelihood-based
Phylogenetic Tree Calculations using Subtree Equality
Vectors",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap113.pdf",
abstract = "Heuristics for calculating phylogenetic trees for
large sets of aligned rRNA sequences based on the
maximum likelihood method are computationally
expensive. The core of most parallel algorithms, which
accounts for the greatest part of computation time, is
the tree evaluation function, that calculates the
likelihood value for each tree topology. This paper
describes and uses Subtree Equality Vectors (SEVs) to
reduce the number of required floating point operations
during topology evaluation. We integrated our
optimizations into various sequential programs and into
parallel fastDNAml, one of the most common and
efficient parallel programs for calculating large
phylogenetic trees. Experimental results for our
parallel program, which renders exactly the same output
as parallel fastDNAml show global run time improvements
of 26\% to 65\%. The optimization scales best on
clusters of PCs, which also implies a substantial cost
saving factor for the determination of large trees.",
acknowledgement = ack-nhfb,
}
@InProceedings{Akcelik:2002:PMG,
author = "Volkan Akcelik and George Biros and Omar Ghattas",
title = "Parallel Multiscale {Gauss--Newton--Krylov} Methods for
Inverse Wave Propagation",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap330.pdf",
abstract = "One of the outstanding challenges of computational
science and engineering is large-scale nonlinear
parameter estimation of systems governed by partial
differential equations. These are known as inverse
problems, in contradistinction to the forward problems
that usually characterize large-scale simulation.
Inverse problems are significantly more difficult to
solve than forward problems, due to ill-posedness,
large dense ill-conditioned operators, multiple minima,
space-time coupling, and the need to solve the forward
problem repeatedly. We present a parallel algorithm for
inverse problems governed by time-dependent PDEs, and
scalability results for an inverse wave propagation
problem of determining the material field of an
acoustic medium. The difficulties mentioned above are
addressed through a combination of total variation
regularization, preconditioned matrix-free
Gauss--Newton--Krylov iteration, algorithmic
checkpointing, and multiscale continuation. We are able
to solve a synthetic inverse wave propagation problem
through a pelvic bone geometry involving 2.1 million
inversion parameters in 3 hours on 256 processors of
the Terascale Computing System at the Pittsburgh
Supercomputing Center.",
acknowledgement = ack-nhfb,
}
@InProceedings{Hariharan:2002:SPF,
author = "Bhanu Hariharan and Srinivas Aluru and Balasubramaniam
Shanker",
title = "A Scalable Parallel Fast Multipole Method for Analysis
of Scattering from Perfect Electrically Conducting
Surfaces",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap295.pdf",
abstract = "In this paper, we develop a parallel Fast Multipole
Method (FMM) based solution for computing the scattered
electromagnetic fields from a Perfect Electrically
Conducting (PEC) surface. The main contributions of
this work are the development of parallel algorithms
with the following characteristics: (1) provably
efficient worst-case run-time irrespective of the shape
of the scatterer, (2) communication efficiency, and (3)
guaranteed load balancing within a small constant
factor. We have developed a scalable, parallel code and
validated it against surfaces for which solution can be
computed analytically, and against serial software. The
efficiency and scalability of the code is demonstrated
with experimental results on an IBM xSeries cluster.
Though developed in the context of this particular
application, our algorithms can be used in other
applications involving parallel FMM.",
acknowledgement = ack-nhfb,
}
@InProceedings{Karniadakis:2002:DLP,
author = "Suchuan Dong and George Em. Karniadakis",
title = "Dual-Level Parallelism for Deterministic and
Stochastic {CFD} Problems",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap137.pdf",
abstract = "A hybrid two-level parallelism using MPI/OpenMP is
implemented in the general-purpose spectral/hp element
CFD code NekTar to take advantage of the hierarchical
structures arising in deterministic and stochastic CFD
problems. We take a coarse grain approach to
shared-memory parallelism with OpenMP and employ a
workload-splitting scheme that can reduce the OpenMP
synchronizations to the minimum. The hybrid
implementation shows good scalability with respect to
both the problem size and the number of processors in
case of a fixed problem size. With the same number of
processors, the hybrid model with 2 (or 4) OpenMP
threads per MPI process is observed to perform better
than pure MPI and pure OpenMP on the NCSA SGI Origin
2000, while the pure MPI model performs the best on the
IBM SP3 at SDSC and on the Compaq Alpha cluster at PSC.
A key new result is that the use of threads facilitates
effectively $p$-refinement, which is crucial to adaptive
discretization using high-order methods.",
acknowledgement = ack-nhfb,
}
@InProceedings{Tapus:2002:AHT,
author = "Cristian T{\u{a}}pu{\c{s}} and I-Hsin Chung and
Jeffrey K. Hollingsworth",
title = "{Active Harmony}: Towards Automated Performance
Tuning",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap138.pdf",
abstract = "In this paper, we present the Active Harmony automated
runtime tuning system. We describe the interface used
by programs to make applications tunable. We present
the Library Specification Layer which helps program
library developers expose multiple variations of the
same API using different algorithms. The Library
Specification Language helps to select the most
appropriate program library to tune the overall
performance. We also present the optimization algorithm
used to adjust parameters in the application and the
libraries. Finally, we present results that show how
the system is able to tune several real applications.
The automated tuning system is able to tune the
application parameters to within a few percent of the
best value after evaluating only 11 out of over 1,700
possible configurations.",
acknowledgement = ack-nhfb,
}
@InProceedings{Rauber:2002:LSH,
author = "Thomas Rauber and Gudula R{\"u}nger",
title = "Library Support for Hierarchical Multi-Processor
Tasks",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap176.pdf",
abstract = "The paper considers the modular programming with
hierarchically structured multi-processor tasks on top
of SPMD tasks for distributed memory machines. The
parallel execution requires a corresponding
decomposition of the set of processors into a
hierarchical group structure onto which the tasks are
mapped. This results in a multi-level group SPMD
computation model with varying processor group
structures. The advantage of this kind of mixed task
and data parallelism is a potential to reduce the
communication overhead and to increase scalability. We
present a runtime library to support the coordination
of hierarchically structured multi-processor tasks. The
library exploits an extended parallel group SPMD
programming model and manages the entire task execution
including the dynamic hierarchy of processor groups.
The library is built on top of MPI, has an easy-to-use
interface, and leads to only a marginal overhead while
allowing static planning and dynamic restructuring.",
acknowledgement = ack-nhfb,
keywords = "mixed task and data parallelism; multiprocessor tasks;
multilevel group SPMD; hierarchical decomposition of
processor sets; library support; distributed memory",
}
@InProceedings{Frachtenberg:2002:SLF,
author = "Eitan Frachtenberg and Fabrizio Petrini and Juan
Fernandez and Salvador Coll and Scott Pakin",
title = "{STORM}: Lightning-Fast Resource Management",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap297.pdf",
abstract = "Although workstation clusters are a common platform
for high-performance computing (HPC), they remain more
difficult to manage than sequential systems or even
symmetric multiprocessors. Furthermore, as cluster
sizes increase, the quality of the resource-management
subsystem --- essentially, all of the code that runs on
a cluster other than the applications --- increasingly
impacts application efficiency. In this paper, we
present STORM, a resource-management framework designed
for scalability and performance. The key innovation
behind STORM is a software architecture that enables
resource management to exploit low-level network
features. As a result of this HPC-application-like
design, STORM is orders of magnitude faster than the
best reported results in the literature on two sample
resource-management functions: job launching and
process scheduling.",
acknowledgement = ack-nhfb,
}
@InProceedings{Colarelli:2002:MAI,
author = "Dennis Colarelli and Dirk Grunwald",
title = "Massive Arrays of Idle Disks For Storage Archives",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap312.pdf",
abstract = "The declining costs of commodity disk drives is
rapidly changing the economics of deploying large
amounts of online or near-line storage. Conventional
mass storage systems use either high performance RAID
clusters, automated tape libraries or a combination of
tape and disk. In this paper, we analyze an alternative
design using massive arrays of idle disks, or MAID. We
argue that this storage organization provides storage
densities matching or exceeding those of tape libraries
with performance similar to disk arrays. Moreover, we
show that with effective power management of individual
drives, this performance can be achieved using a very
small power budget. In particular, we show that our
power management strategy can result in the performance
comparable to an always-on RAID system while using
$1/15$th the power of such a RAID system.",
acknowledgement = ack-nhfb,
}
%%% SC2002 paper (PDF pap105): Gilgamesh processor-in-memory architecture.
%%% Bibliographic data (booktitle, publisher, ISBN) inherited via crossref.
@InProceedings{Sterling:2002:GMP,
author = "Thomas L. Sterling and Hans P. Zima",
title = "{Gilgamesh}: {A} Multithreaded Processor-In-Memory
Architecture for Petaflops Computing",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap105.pdf",
abstract = "Processor-in-Memory (PIM) architectures avoid the von
Neumann bottleneck in conventional machines by
integrating high-density DRAM and CMOS logic on the
same chip. Parallel systems based on this new
technology are expected to provide higher scalability,
adaptability, robustness, fault tolerance and lower
power consumption than current MPPs or commodity
clusters. In this paper we describe the design of
Gilgamesh, a PIM-based massively parallel architecture,
and elements of its execution model. Gilgamesh extends
existing PIM capabilities by incorporating advanced
mechanisms for virtualizing tasks and data and
providing adaptive resource management for load
balancing and latency tolerance. The Gilgamesh
execution model is based on macroservers, a middleware
layer which supports object-based runtime management of
data and threads allowing explicit and dynamic control
of locality and load balancing. The paper concludes
with a discussion of related research activities and an
outlook to future work.",
acknowledgement = ack-nhfb,
}
%%% SC2002 paper (PDF pap102): owner prediction for cc-NUMA coherence.
%%% NOTE(review): restored Spanish accents in the author names using BibTeX
%%% special characters ({\'e}, {\'a}, {\'\i}) so the file stays ISO/ASCII
%%% as declared in the header; verify spellings against the published paper.
@InProceedings{Acacio:2002:OPA,
author = "Manuel E. Acacio and Jos{\'e} Gonz{\'a}lez and Jos{\'e} M. Garc{\'\i}a
and Jos{\'e} Duato",
title = "Owner Prediction for Accelerating Cache-to-Cache
Transfer Misses in a cc-{NUMA} Architecture",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap102.pdf",
abstract = "Cache misses for which data must be obtained from a
remote cache (cache-to-cache transfer misses) account
for an important fraction of the total miss rate.
Unfortunately, cc-NUMA designs put the access to the
directory information into the critical path of 3-hop
misses, which significantly penalizes them compared to
SMP designs. This work studies the use of owner
prediction as a means of providing cc-NUMA
multiprocessors with a more efficient support for
cache-to-cache transfer misses. Our proposal comprises
an effective prediction scheme as well as a coherence
protocol designed to support the use of prediction.
Results indicate that owner prediction can
significantly reduce the latency of cache-to-cache
transfer misses, which translates into speed-ups on
application performance up to 12\%. In order to also
accelerate most of those 3-hop misses that are either
not predicted or mispredicted, the inclusion of a small
and fast directory cache in every node is evaluated,
leading to improvements up to 16\% on the final
performance.",
acknowledgement = ack-nhfb,
}
%%% SC2002 paper (PDF pap273): 16.4 Tflops turbulence DNS on the Earth
%%% Simulator (Gordon Bell class).
%%% NOTE(review): restored superscripts lost in text extraction:
%%% "40963" -> $4096^3$ and "20483" -> $2048^3$ (cube grid sizes).
@InProceedings{Ishihara:2002:TDN,
author = "Mitsuo Yokokawa and Ken'ichi Itakura and Atsuya Uno
and Takashi Ishihara and Yukio Kaneda",
title = "{16.4 Tflops} Direct Numerical Simulation of
Turbulence by {Fourier} Spectral Method on the {Earth
Simulator}",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap273.pdf",
abstract = "The high-resolution direct numerical simulations
(DNSs) of incompressible turbulence with numbers of
grid points up to $4096^3$ have been executed on the Earth
Simulator (ES). The DNSs are based on the Fourier
spectral method, so that the equation for mass
conservation is accurately solved. In DNS based on the
spectral method, most of the computation time is
consumed in calculating the three-dimensional (3D) Fast
Fourier Transform (FFT), which requires huge-scale
global data transfer and has been the major stumbling
block that has prevented truly high-performance
computing. By implementing new methods to efficiently
perform the 3D-FFT on the ES, we have achieved DNS at
16.4 Tflops on $2048^3$ grid points. The DNS yields an
energy spectrum exhibiting a wide inertial subrange, in
contrast to previous DNSs with lower resolutions, and
therefore provides valuable data for the study of the
universal features of turbulence at large Reynolds
number.",
acknowledgement = ack-nhfb,
}
%%% SC2002 paper (PDF pap147): 14.9 TFLOPS HPF fluid simulation on the
%%% Earth Simulator.
%%% NOTE(review): set the mesh size "2048x2048x4096" in math mode with
%%% \times instead of a literal "x".
@InProceedings{Sakagami:2002:TTD,
author = "Hitoshi Sakagami and Hitoshi Murai and Yoshiki Seo and
Mitsuo Yokokawa",
title = "{14.9 TFLOPS} Three-dimensional Fluid Simulation for
Fusion Science with {HPF} on the {Earth Simulator}",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap147.pdf",
abstract = "We succeeded in getting 14.9 TFLOPS performance when
running a plasma simulation code IMPACT-3D parallelized
with High Performance Fortran on 512 nodes of the Earth
Simulator. The theoretical peak performance of the 512
nodes is 32 TFLOPS, which means 45\% of the peak
performance was obtained with HPF. IMPACT-3D is an
implosion analysis code using TVD scheme, which
performs three-dimensional compressible and inviscid
Eulerian fluid computation with the explicit 5-point
stencil scheme for spatial differentiation and the
fractional time step for time integration. The mesh
size is $2048 \times 2048 \times 4096$, and the third dimension was
distributed for the parallelization. The HPF system
used in the evaluation is HPF/ES, developed for the
Earth Simulator by enhancing NEC HPF/SX V2 mainly in
communication scalability. Shift communications were
manually tuned to get best performance by using HPF/JA
extensions, which was designed to give the users more
control over sophisticated parallelization and
communication optimizations.",
acknowledgement = ack-nhfb,
}
%%% SC2002 paper (PDF pap331): 26.58 Tflops AFES global atmospheric
%%% simulation on the full 640-node Earth Simulator.
@InProceedings{Shingu:2002:TGA,
author = "Satoru Shingu and Hiroshi Takahara and Hiromitsu
Fuchigami and Masayuki Yamada and Yoshinori Tsuda and
Wataru Ohfuchi and Yuji Sasaki and Kazuo Kobayashi and
Takashi Hagiwara and Shin-ichi Habata and Mitsuo
Yokokawa and Hiroyuki Itoh and Kiyoshi Otsuka",
title = "A {26.58 Tflops} Global Atmospheric Simulation with
the Spectral Transform Method on the {Earth
Simulator}",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap331.pdf",
abstract = "A spectral atmospheric general circulation model
called AFES (AGCM for Earth Simulator) was developed
and optimized for the architecture of the Earth
Simulator (ES). The ES is a massively parallel vector
supercomputer that consists of 640 processor nodes
interconnected by a single stage crossbar network with
its total peak performance of 40.96 Tflops. The
sustained performance of 26.58 Tflops was achieved for
a high resolution simulation (T1279L96) with AFES by
utilizing the full 640-node configuration of the ES.
The resulting computing efficiency is 64.9\% of the
peak performance, well surpassing that of conventional
weather/climate applications having just 25--50\%
efficiency even on vector parallel computers. This
remarkable performance proves the effectiveness of the
ES as a viable means for practical applications.",
acknowledgement = ack-nhfb,
}
%%% SC2002 paper (PDF pap158): Sun Fire system-interconnect instrumentation.
%%% NOTE(review): \TM{} (trademark) is not a standard LaTeX macro; it is
%%% presumably defined in this file's @Preamble (outside this view) ---
%%% verify before extracting this entry into another .bib file.
@InProceedings{Noordergraaf:2002:SSI,
author = "Lisa Noordergraaf and Robert Zak",
title = "{SMP} System Interconnect Instrumentation for
Performance Analysis",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap158.pdf",
abstract = "The system interconnect is often the performance
bottleneck in SMP computers. Although modern SMPs
include event counters on processors and interconnects,
these provide limited information about the interaction
of processors vying for shared resources. Additionally,
transaction sources and addresses are not readily
available, making analysis of access patterns and data
locality difficult. Enhanced system interconnect
instrumentation is required to extract this
information.\par
This paper describes instrumentation implemented for
monitoring the system interconnect on Sun Fire\TM{}
servers. The instrumentation supports sophisticated
programmable filtering of event counters, allowing us
to construct histograms of system interconnect
activity, and a FIFO to capture trace sequences. Our
implementation results in a very small hardware
footprint, making it appropriate for inclusion in
commodity hardware.\par
We also describe a sampling of software tools and
results based on this infrastructure. Applications have
included performance profiling, architectural studies,
and hardware bringup and debugging.",
acknowledgement = ack-nhfb,
}
%%% SC2002 paper (PDF pap258): scheduling pipelined data-analysis chains
%%% on Grid resources.
@InProceedings{Spencer:2002:EMP,
author = "Matthew Spencer and Renato Ferreira and Michael Beynon
and Tahsin Kurc and Umit Catalyurek and Alan Sussman
and Joel Saltz",
title = "Executing Multiple Pipelined Data Analysis Operations
in the Grid",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap258.pdf",
abstract = "Processing of data in many data analysis applications
can be represented as an acyclic, coarse grain data
flow, from data sources to the client. This paper is
concerned with scheduling of multiple data analysis
operations, each of which is represented as a pipelined
chain of processing on data. We define the scheduling
problem for effectively placing components onto Grid
resources, and propose two scheduling algorithms.
Experimental results are presented using a
visualization application.",
acknowledgement = ack-nhfb,
}
%%% SC2002 paper (PDF pap229): decoupled scheduler for the GrADS Grid
%%% program-development environment.
@InProceedings{Dail:2002:DSA,
author = "Holly Dail and Henri Casanova and Fran Berman",
title = "A Decoupled Scheduling Approach for the {GrADS}
Program Development Environment",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap229.pdf",
abstract = "Program development environments are instrumental in
providing users with easy and efficient access to
parallel computing platforms. While a number of such
environments have been widely accepted and used for
traditional HPC systems, there are currently no widely
used environments for Grid programming. The goal of the
Grid Application Development Software (GrADS) project
is to develop a coordinated set of tools, libraries and
run-time execution facilities for Grid program
development. In this paper, we describe a Grid
scheduler component that is integrated as part of the
GrADS software system. Traditionally, application-level
schedulers (e.g. AppLeS) have been tightly integrated
with the application itself and were not easily applied
to other applications. Our design is generic: we
decouple the scheduler core (the search procedure) from
the application-specific (e.g. application performance
models) and platform-specific (e.g. collection of
resource information) components used by the search
procedure. We provide experimental validation of our
approach for two representative regular, iterative
parallel programs in a variety of real-world Grid
testbeds. Our scheduler consistently outperforms static
and user-driven scheduling methods.",
acknowledgement = ack-nhfb,
}
%%% SC2002 paper (PDF pap299): Chimera virtual-data system applied to
%%% galaxy-cluster finding in the Sloan Digital Sky Survey.
@InProceedings{Annis:2002:ACV,
author = "James Annis and Yong Zhao and Jens Voeckler and
Michael Wilde and Steve Kent and Ian Foster",
title = "Applying {Chimera} Virtual Data Concepts to Cluster
Finding in the {Sloan Sky Survey}",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap299.pdf",
abstract = "In many scientific disciplines --- especially long
running, data-intensive collaborations --- it is
important to track all aspects of data capture,
production, transformation, and analysis. In principle,
one can then audit, validate, reproduce, and/or re-run
with corrections various data transformations. We have
recently proposed and prototyped the Chimera virtual
data system, a new database-driven approach to this
problem. We present here a major application study in
which we apply Chimera to a challenging data analysis
problem: the identification of galaxy clusters within
the Sloan Digital Sky Survey. We describe the problem,
its computational procedures, and the use of Chimera to
plan and orchestrate the workflow of thousands of tasks
on a data grid comprising hundreds of computers. This
experience suggests that a general set of tools can
indeed enhance the accuracy and productivity of
scientific data reduction and that further development
and application of this paradigm will offer great
value.",
acknowledgement = ack-nhfb,
}
%%% SC2002 paper (PDF pap219): Active Proxy-G query-caching middleware
%%% for Grid data analysis.
@InProceedings{Andrade:2002:APG,
author = "Henrique Andrade and Tahsin Kurc and Alan Sussman and
Joel Saltz",
title = "{Active Proxy-G}: Optimizing the Query Execution
Process in the Grid",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap219.pdf",
abstract = "The Grid environment facilitates collaborative work
and allows many users to query and process data over
geographically dispersed data repositories. Over the
past several years, there has been a growing interest
in developing applications that interactively analyze
datasets, potentially in a collaborative setting. We
describe the Active Proxy-G service that is able to
cache query results, use those results for answering
new incoming queries, generate subqueries for the parts
of a query that cannot be produced from the cache, and
submit the subqueries for final processing at
application servers that store the raw datasets. We
present an experimental evaluation to illustrate the
effects of various design tradeoffs. We also show the
benefits that two real applications gain from using the
middleware.",
acknowledgement = ack-nhfb,
}
%%% SC2002 paper (PDF pap239): Giggle framework for scalable replica
%%% location services in data grids.
@InProceedings{Chervenak:2002:GFC,
author = "Ann Chervenak and Ewa Deelman and Ian Foster and
Leanne Guy and Wolfgang Hoschek and Adriana Iamnitchi
and Carl Kesselman and Peter Kunszt and Matei Ripeanu
and Bob Schwartzkopf and Heinz Stockinger and Kurt
Stockinger and Brian Tierney",
title = "{Giggle}: {A} Framework for Constructing Scalable
Replica Location Services",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap239.pdf",
abstract = "In wide area computing systems, it is often desirable
to create remote read-only copies (replicas) of files.
Replication can be used to reduce access latency,
improve data locality, and/or increase robustness,
scalability and performance for distributed
applications. We define a replica location service
(RLS) as a system that maintains and provides access to
information about the physical locations of copies. An
RLS typically functions as one component of a data grid
architecture. This paper makes the following
contributions. First, we characterize RLS requirements.
Next, we describe a parameterized architectural
framework, which we name Giggle (for GIGa-scale Global
Location Engine), within which a wide range of RLSs can
be defined. We define several concrete instantiations
of this framework with different performance
characteristics. Finally, we present initial
performance results for an RLS prototype, demonstrating
that RLS systems can be constructed that meet
performance goals.",
acknowledgement = ack-nhfb,
}
%%% SC2002 paper (PDF pap196): early evaluation of the IBM p690 at ORNL.
%%% NOTE(review): the citation key is formed from the LAST author (Bland),
%%% not the first (Worley) --- presumably intentional in this file's keying
%%% scheme; verify before renaming, since other files may cite this key.
@InProceedings{Bland:2002:EEI,
author = "P. H. Worley and T. H. {Dunigan, Jr.} and M. R. Fahey
and J. B. {White III} and A. S. Bland",
title = "Early Evaluation of the {IBM p690}",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap196.pdf",
abstract = "Oak Ridge National Laboratory recently received 27
32-way IBM pSeries 690 SMP nodes. In this paper, we
describe our initial evaluation of the p690
architecture, focusing on the performance of benchmarks
and applications that are representative of the
expected production workload.",
acknowledgement = ack-nhfb,
}
%%% SC2002 paper (PDF pap207): BlueGene/L overview (IBM/LLNL), with a very
%%% large corporate author list.
%%% NOTE(review): "C. Cacaval" looks like a misspelling of "C. Ca{\c{s}}caval"
%%% (Calin Cascaval) --- verify against the published author list before
%%% correcting.
@InProceedings{Adiga:2002:OBS,
author = "N. R. Adiga and G. Almasi and G. S. Almasi and Y.
Aridor and R. Barik and D. Beece and R. Bellofatto and
G. Bhanot and R. Bickford and M. Blumrich and
A. A. Bright and J. Brunheroto and C. Cacaval and
J. Casta{\~n}os and W. Chan and L. Ceze and P. Coteus
and S. Chatterjee and D. Chen and G. Chiu and
T. M. Cipolla and P. Crumley and K. M. Desai and
A. Deutsch and T. Domany and M. B. Dombrowa and
W. Donath and M. Eleftheriou and C. Erway and J. Esch
and B. Fitch and J. Gagliano and A. Gara and R. Garg
and R. Germain and M. E. Giampapa and B. Gopalsamy and
J. Gunnels and M. Gupta and F. Gustavson and S. Hall
and R. A. Haring and D. Heidel and P. Heidelberger and
L. M. Herger and D. Hoenicke and R. D. Jackson and
T. Jamal-Eddine and G. V. Kopcsay and E. Krevat and
M. P. Kurhekar and A. P. Lanzetta and D. Lieber and
L. K. Liu and M. Lu and M. Mendell and A. Misra and
Y. Moatti and L. Mok and J. E. Moreira and
B. J. Nathanson and M. Newton and M. Ohmacht and
A. Oliner and V. Pandit and R. B. Pudota and R. Rand
and R. Regan and B. Rubin and A. Ruehli and S. Rus and
R. K. Sahoo and A. Sanomiya and E. Schenfeld and
M. Sharma and E. Shmueli and S. Singh and P. Song and
V. Srinivasan and B. D. Steinmacher-Burow and
K. Strauss and C. Surovic and R. Swetz and T. Takken
and R. B. Tremaine and M. Tsao and A. R. Umamaheshwaran
and P. Verma and P. Vranas and T. J. C. Ward and
M. Wazlowski and W. Barrett and C. Engel and B. Drehmel
and B. Hilgart and D. Hill and F. Kasemkhani and
D. Krolak and C. T. Li and T. Liebsch and J. Marcella
and A. Muff and A. Okomo and M. Rouse and A. Schram and
M. Tubbs and G. Ulsh and C. Wait and J. Wittrup and
M. Bae and K. Dockser and L. Kissel and M. K. Seager
and J. S. Vetter and K. Yates",
title = "An Overview of the {BlueGene/L} Supercomputer",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap207.pdf",
abstract = "This paper gives an overview of the BlueGene/L
Supercomputer. This is a jointly funded research
partnership between IBM and the Lawrence Livermore
National Laboratory as part of the United States
Department of Energy ASCI Advanced Architecture
Research Program. Application performance and scaling
studies have recently been initiated with partners at a
number of academic and government institutions,
including the San Diego Supercomputer Center and the
California Institute of Technology. This massively
parallel system of 65,536 nodes is based on a new
architecture that exploits system-on-a-chip technology
to deliver target peak processing power of 360
teraFLOPS (trillion floating-point operations per
second). The machine is scheduled to be operational in
the 2004-2005 time frame, at price/performance and
power consumption/performance targets unobtainable with
conventional architectures.",
acknowledgement = ack-nhfb,
}
%%% SC2002 paper (PDF pap210): Green Destiny 240-processor Beowulf cluster.
%%% NOTE(review): restored superscripts lost in text extraction
%%% ("meter2" -> meter$^2$, "meter3" -> meter$^3$) and added the missing
%%% final period of the abstract.
@InProceedings{Warren:2002:HDC,
author = "Michael S. Warren and Eric H. Weigle and Wu-Chun
Feng",
title = "High-Density Computing: {A} 240-Processor {Beowulf} in
One Cubic Meter",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap210.pdf",
abstract = "We present results from computations on Green Destiny,
a 240-processor Beowulf cluster which is contained
entirely within a single 19-inch wide 42U rack. The
cluster consists of 240 Transmeta TM5600 667-MHz CPUs
mounted on RLX Technologies motherboard blades. The
blades are mounted side-by-side in an RLX 3U rack-mount
chassis, which holds 24 blades. The overall cluster
contains 10 chassis and associated Fast and Gigabit
Ethernet switches. The system has a footprint of 0.5
meter$^2$ (6 square feet), a volume of 0.85 meter$^3$ (30
cubic feet) and a measured power dissipation under load
of 5200 watts (including network switches). We have
measured the performance of the cluster using a
gravitational treecode N-body simulation of galaxy
formation using 200 million particles, which sustained
an average of 38.9 Gflops on 212 nodes of the system.
We also present results from a three-dimensional
hydrodynamic simulation of a core-collapse supernova.",
acknowledgement = ack-nhfb,
keywords = "Beowulf; cluster; blade server; RLX; Transmeta; code
morphing; VLIW; performance-per-square-foot;
MIPS-per-watt",
}
%%% SC2002 paper (PDF pap167): departmental computing Grid applied to
%%% tapping-sound-analysis NDE simulations (Globus/Condor).
@InProceedings{Kim:2002:UDC,
author = "Seung Jo Kim and Joon-Seok Hwang and Chang Sung Lee
and Sangsan Lee",
title = "Utilization of Departmental Computing {GRID} System
for Development of an Artificial Intelligent Tapping
Inspection Method, Tapping Sound Analysis",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap167.pdf",
abstract = "Tapping Sound Analysis is a new NDE method, which
determines the existence of subsurface defects by
comparing the tapping sound of test structure and
original healthy structure. The tapping sound of
original healthy structure is named sound print of the
structure and is obtained through high precision
computation. Because many tapping points are required
to obtain the exact sound print data, many times of
tapping sound simulation are required. The simulation
of tapping sound requires complicated numerical
procedures. Departmental Computing GRID system was
utilized to run numerical simulations. Three cluster
systems and one PC-farm system comprise DCG system.
Tapping sound simulations were launched and monitored
through Globus and CONDOR. A total of 160 Tera
floating-point (double-precision) operations was
performed and the elapsed time was 41,880 sec. From the
numerical experiments, Grid computing technology
reduced the necessary time to make sound print database
and made TSA a feasible and practical methodology.",
acknowledgement = ack-nhfb,
}
%%% SC2002 paper (PDF pap111): multiscale MD/QM simulations on a US--Japan
%%% Grid of PC clusters.
%%% NOTE(review): inserted the missing space after the semicolon in the
%%% keywords list ("dynamics;quantum" -> "dynamics; quantum") to match the
%%% separator style used by the rest of the field.
@InProceedings{Kikuchi:2002:CSG,
author = "Hideaki Kikuchi and Rajiv K. Kalia and Aiichiro Nakano
and Priya Vashishta and Hiroshi Iyetomi and Shuji Ogata
and Takahisa Kouno and Fuyuki Shimojo and Kenji Tsuruta
and Subhash Saini",
title = "Collaborative Simulation Grid: Multiscale
Quantum-Mechanical\slash Classical Atomistic
Simulations on Distributed {PC} Clusters in the {US}
and {Japan}",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap111.pdf",
abstract = "A multidisciplinary, collaborative simulation has been
performed on a Grid of geographically distributed PC
clusters. The multiscale simulation approach seamlessly
combines (i) atomistic simulation based on the
molecular dynamics (MD) method and (ii) quantum
mechanical (QM) calculation based on the density
functional theory (DFT), so that accurate but less
scalable computations are performed only where they are
needed. The multiscale MD/QM simulation code has been
Grid-enabled using (i) a modular, additive
hybridization scheme, (ii) multiple QM clustering, and
(iii) computation/communication overlapping. The
Gridified MD/QM simulation code has been used to study
environmental effects of water molecules on fracture in
silicon. A preliminary run of the code has achieved a
parallel efficiency of 94\% on 25 PCs distributed over
3 PC clusters in the US and Japan, and a larger test
involving 154 processors on 5 distributed PC clusters
is in progress.",
acknowledgement = ack-nhfb,
keywords = "Grid application; multiscale simulation; molecular
dynamics; quantum mechanics; density functional theory",
}
%%% SC2002 paper (PDF pap141): QMView/GAMESS computational-chemistry
%%% integration into a grid workflow environment.
@InProceedings{Baldridge:2002:QGI,
author = "Kim K. Baldridge and Jerry P. Greenberg and Stephen T.
Elbert and Stephen Mock and Philip Papadopoulos",
title = "{QMView} and {GAMESS}: Integration into the {World
Wide Computational Grid}",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap141.pdf",
abstract = "High performance computing, storage, visualization,
and database infrastructures are increasing
geometrically in complexity as scientists move towards
grid-based computing. While this is natural, it has the
effect of pushing computational capabilities beyond the
reach of scientists because of the time needed to
harness the infrastructure. Hiding the complexity of
networked resources becomes essential if scientists are
to utilize them effectively. In this work, we describe
our efforts to integrate various computational
chemistry components into a scientific computing
environment. We briefly describe improvements we have
made to individual components of the chemistry
environment as well as future directions, followed by a
more in-depth discussion of our strategy for
integration into a grid workflow environment based on
web services, which enables access to remote resources
while shielding users from the complexities of the grid
infrastructures. A preliminary schema for storing data
obtained from computational chemistry calculations is
also described.",
acknowledgement = ack-nhfb,
}
%%% SC2002 paper (PDF pap202): Proactive Directory Service (PDS) for grids
%%% and pervasive computing.
%%% NOTE(review): first author may carry an accent (Fabi{\'a}n E.
%%% Bustamante) --- verify against the published paper before changing.
@InProceedings{Bustamante:2002:SDS,
author = "Fabian E. Bustamante and Patrick Widener and Karsten
Schwan",
title = "Scalable Directory Services Using Proactivity",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap202.pdf",
abstract = "Common to computational grids and pervasive computing
is the need for an expressive, efficient, and scalable
directory service that provides information about
objects in the environment. We argue that a directory
interface that `pushes' information to clients about
changes to objects can significantly improve
scalability. This paper describes the design,
implementation, and evaluation of the Proactive
Directory Service (PDS). PDS' interface supports a
customizable `proactive' mode through which clients can
subscribe to be notified about changes to their objects
of interest. Clients can dynamically tune the detail
and granularity of these notifications through filter
functions instantiated at the server or at the object's
owner, and by remotely tuning the functionality of
those filters. We compare PDS' performance against
off-the-shelf implementations of DNS and the
Lightweight Directory Access Protocol. Our evaluation
results confirm the expected performance advantages of
this approach and demonstrate that customized
notification through filter functions can reduce
bandwidth utilization while improving the performance
of both clients and directory servers.",
acknowledgement = ack-nhfb,
}
%%% SC2002 paper (PDF pap234): relational monitoring-data archive for Grid
%%% environments.
%%% NOTE(review): fixed a tripled closing quote in the abstract
%%% (``Grid Monitoring Architecture''' -> ...Architecture'').
@InProceedings{Lee:2002:MDA,
author = "Jason Lee and Dan Gunter and Martin Stoufer and Brian
Tierney",
title = "Monitoring Data Archives for Grid Environments",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap234.pdf",
abstract = "Developers and users of high-performance distributed
systems often observe performance problems such as
unexpectedly low throughput or high latency. To
determine the source of these performance problems,
detailed end-to-end monitoring data from applications,
networks, operating systems, and hardware must be
correlated across time and space. Researchers need to
be able to view and compare this very detailed
monitoring data from a variety of angles. To address
this problem, we propose a relational monitoring data
archive that is designed to efficiently handle
high-volume streams of monitoring data. In this paper
we present an instrumentation and monitoring event
archive service that can be used to collect and
aggregate detailed end-to-end monitoring information
from distributed applications. This archive service is
designed to be scalable and fault tolerant. We also
show how the archive is based on the ``Grid Monitoring
Architecture'' defined by the Global Grid Forum.",
acknowledgement = ack-nhfb,
}
%%% SC2002 paper (PDF pap213): merging distributed data streams on common
%%% keys.
%%% NOTE(review): the abstract read "of length less than ." --- a symbol
%%% (presumably the fixed buffer length named earlier in the abstract) was
%%% lost in text extraction. Restored as "less than the buffer length";
%%% verify the exact symbol against the published abstract.
@InProceedings{Mazzucco:2002:MMD,
author = "Marco Mazzucco and Asvin Ananthanarayan and Robert L.
Grossman and Jorge Levera and Gokulnath B. Rao",
title = "Merging Multiple Data Streams on Common Keys Over High
Performance Networks",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap213.pdf",
abstract = "The model for data mining on streaming data assumes
that there is a buffer of fixed length and a data
stream of infinite length and the challenge is to
extract patterns, changes, anomalies, and statistically
significant structures by examining the data one time
and storing records and derived attributes of length
less than the buffer length. As data grids, data webs,
and semantic webs
become more common, mining distributed streaming data
will become more and more important. The first step
when presented with two or more distributed streams is
to merge them using a common key. In this paper, we
present two algorithms for merging streaming data using
a common key. We also present experimental studies
showing these algorithms scale in practice to OC-12
networks.",
acknowledgement = ack-nhfb,
}
%%% ====================================================================
%%% Cross-referenced entries must come last:
%%% Crossref parent for all @InProceedings entries above. In classic BibTeX
%%% a crossref'ed entry must appear AFTER every entry that references it
%%% (see the comment preceding this entry), so keep this last in the file.
%%% "????" placeholders mark data (pages, LCCN) not yet known, per the
%%% conventions described in the file header.
@Proceedings{IEEE:2002:STI,
editor = "{IEEE}",
booktitle = "{SC2002}: From Terabytes to Insight. Proceedings of
the {IEEE ACM SC 2002 Conference, November 16--22,
2002, Baltimore, MD, USA}",
title = "{SC2002}: From Terabytes to Insight. Proceedings of
the {IEEE ACM SC 2002 Conference, November 16--22,
2002, Baltimore, MD, USA}",
publisher = pub-IEEE,
address = pub-IEEE:adr,
pages = "????",
year = "2002",
ISBN = "0-7695-1524-X",
ISBN-13 = "978-0-7695-1524-3",
LCCN = "????",
bibdate = "Thu Feb 21 18:29:36 2002",
acknowledgement = ack-nhfb,
}