%%% -*-BibTeX-*-
%%% ====================================================================
%%% BibTeX-file{
%%% author = "Nelson H. F. Beebe",
%%% version = "1.03",
%%% date = "25 October 2010",
%%% time = "17:23:38 MDT",
%%% filename = "supercomputing2002.bib",
%%% address = "University of Utah
%%% Department of Mathematics, 110 LCB
%%% 155 S 1400 E RM 233
%%% Salt Lake City, UT 84112-0090
%%% USA",
%%% telephone = "+1 801 581 5254",
%%% FAX = "+1 801 581 4148",
%%% URL = "http://www.math.utah.edu/~beebe",
%%% checksum = "15932 2233 13182 122527",
%%% email = "beebe at math.utah.edu, beebe at acm.org,
%%% beebe at computer.org (Internet)",
%%% codetable = "ISO/ASCII",
%%% keywords = "BibTeX, bibliography, SC2002, Supercomputing
%%% 2002",
%%% license = "public domain",
%%% supported = "yes",
%%% docstring = "This is a complete bibliography of papers
%%% published in the proceedings of
%%% Supercomputing '2002.
%%%
%%% The conference World-Wide Web site is
%%%
%%% http://www.sc-2002.org/
%%%
%%% The organizers of this conference series
%%% maintain a World-Wide Web site at
%%%
%%% http://www.supercomp.org/
%%%
%%% where pointers to Web pages for the
%%% conferences from 1988 to date may be found.
%%%
%%% At version 1.03, the year coverage looked
%%% like this:
%%%
%%% 2002 ( 68)
%%%
%%% InProceedings: 67
%%% Proceedings: 1
%%%
%%% Total entries: 68
%%%
%%% In this bibliography, entries are sorted in
%%% order of PDF file numbers.
%%%
%%% The on-line electronic proceedings do not
%%% contain sequential page numbers, although
%%% there is an ISBN assigned for the
%%% proceedings. A pagecount field is given with
%%% each entry, extracted from the PDF file: some
%%% of the articles lack page numbers altogether,
%%% others number pages 1, 2, 3, ...
%%%
%%% The checksum field above contains a CRC-16
%%% checksum as the first value, followed by the
%%% equivalent of the standard UNIX wc (word
%%% count) utility output of lines, words, and
%%% characters. This is produced by Robert
%%% Solovay's checksum utility.",
%%% }
%%% ====================================================================
@Preamble{
"\ifx \undefined \TM \def \TM {${}^{\sc TM}$} \fi"
}
%%% ====================================================================
%%% Acknowledgement abbreviations:
@String{ack-nhfb = "Nelson H. F. Beebe,
University of Utah,
Department of Mathematics, 110 LCB,
155 S 1400 E RM 233,
Salt Lake City, UT 84112-0090, USA,
Tel: +1 801 581 5254,
FAX: +1 801 581 4148,
e-mail: \path|beebe@math.utah.edu|,
\path|beebe@acm.org|,
\path|beebe@computer.org| (Internet),
URL: \path|http://www.math.utah.edu/~beebe/|"}
%%% ====================================================================
%%% Publishers and their addresses:
@String{pub-IEEE = "IEEE Computer Society Press"}
@String{pub-IEEE:adr = "1109 Spring Street, Suite 300,
Silver Spring, MD 20910, USA"}
%%% ====================================================================
%%% Bibliography entries.
@InProceedings{DeRose:2002:SSI,
author = "Luiz DeRose and K. Ekanadham and Jeffrey Hollingsworth
and Simone Sbaraglia",
title = "{SIGMA}: {A} Simulator Infrastructure to Guide Memory
Analysis",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap191.pdf",
abstract = "In this paper we present SIGMA (Simulation
Infrastructure to Guide Memory Analysis), a new data
collection framework and family of cache analysis
tools. The SIGMA environment provides detailed cache
information by gathering memory reference data using
software-based instrumentation. This infrastructure can
facilitate quick probing into the factors that
influence the performance of an application by
highlighting bottleneck scenarios including: excessive
cache/TLB misses and inefficient data layouts. The tool
can also assist in perturbation analysis to determine
performance variations caused by changes to
architecture or program. Our validation tests using the
SPEC Swim benchmark show that most of the performance
metrics obtained with SIGMA are within 1\% of the
metrics obtained with hardware performance counters,
with the advantage that SIGMA provides performance data
on a data structure level, as specified by the
programmer.",
acknowledgement = ack-nhfb,
}
@InProceedings{Lu:2002:CAS,
author = "Charng-da Lu and Daniel A. Reed",
title = "Compact Application Signatures for Parallel and
Distributed Scientific Codes",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap244.pdf",
abstract = "Understanding the dynamic behavior of parallel
programs is key to developing efficient system software
and runtime environments; this is even more true on
emerging computational Grids where resource
availability and performance can change in
unpredictable ways. Event tracing provides details on
behavioral dynamics, albeit often at great cost. We
describe an intermediate approach, based on curve
fitting, that retains many of the advantages of event
tracing but with lower overhead. These compact
``application signatures'' summarize the time-varying
resource needs of scientific codes from historical
trace data. We also developed a comparison scheme that
measures similarity between two signatures, both across
executions and across execution environments.",
acknowledgement = ack-nhfb,
}
@InProceedings{Ahn:2002:SAT,
author = "Dong H. Ahn and Jeffrey S. Vetter",
title = "Scalable Analysis Techniques for Microprocessor
Performance Counter Metrics",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap257.pdf",
abstract = "Contemporary microprocessors provide a rich set of
integrated performance counters that allow application
developers and system architects alike the opportunity
to gather important information about workload
behaviors. Current techniques for analyzing data
produced from these counters use raw counts, ratios,
and visualization techniques to help users make decisions
about their application performance. While these
techniques are appropriate for analyzing data from one
process, they do not scale easily to new levels
demanded by contemporary computing systems. Very
simply, this paper addresses these concerns by
evaluating several multivariate statistical techniques
on these datasets. We find that several techniques,
such as statistical clustering, can automatically
extract important features from the data. These derived
results can, in turn, be fed directly back to an
application developer, or used as input to a more
comprehensive performance analysis environment, such as
a visualization or an expert system.",
acknowledgement = ack-nhfb,
}
@InProceedings{Bailey:2002:HPC,
author = "David H. Bailey and David Broadhurst and Yozo Hida and
Xiaoye S. Li and Brandon Thompson",
title = "High Performance Computing Meets Experimental
Mathematics",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Fri Aug 08 11:13:32 2008",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap124.pdf",
abstract = "In this paper we describe some novel applications of
high performance computing in a discipline now known as
experimental mathematics. The paper reviews some recent
published work, and then presents some new results that
have not yet appeared in the literature. A key
technique involved in this research is the PSLQ integer
relation algorithm (recently named one of ten
algorithms of the century by Computing in Science and
Engineering). This algorithm permits one to recognize a
numeric constant in terms of the formula that it
satisfies. We present a variant of PSLQ that is
well-suited for parallel computation, and give several
examples of new mathematical results that we have found
using it. Two of these computations were performed on
highly parallel computers, since they are not feasible
on conventional systems. We also describe a new
software package for performing arbitrary precision
arithmetic, which is required in this research.",
acknowledgement = ack-nhfb,
}
@InProceedings{Baumgartner:2002:HLA,
author = "Gerald Baumgartner and David E. Bernholdt and Daniel
Cociorva and Robert Harrison and So Hirata and
Chi-Chung Lam and Marcel Nooijen and Russell Pitzer and
J. Ramanujam and P. Sadayappan",
title = "A High-Level Approach to Synthesis of High-Performance
Codes for Quantum Chemistry",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap306.pdf",
abstract = "This paper discusses an approach to the synthesis of
high-performance parallel programs for a class of
computations encountered in quantum chemistry and
physics. These computations are expressible as a set of
tensor contractions and arise in electronic structure
modeling. An overview is provided of the synthesis
system, that transforms a high-level specification of
the computation into high-performance parallel code,
tailored to the characteristics of the target
architecture. An example from computational chemistry
is used to illustrate how different code structures are
generated under different assumptions of available
memory on the target computer.",
acknowledgement = ack-nhfb,
}
@InProceedings{Ding:2002:MOP,
 author = "Yun He and Chris H. Q. Ding",
 title = "{MPI} and {OpenMP} Paradigms on Cluster of {SMP}
 Architectures",
 crossref = "IEEE:2002:STI",
 pages = "??--??",
 year = "2002",
 bibdate = "Wed Nov 26 07:34:20 2003",
 URL = "http://www.sc-2002.org/paperpdfs/pap.pap325.pdf",
 abstract = "We investigate remapping multi-dimensional arrays on
 cluster of SMP architectures under OpenMP, MPI, and
 hybrid paradigms. Traditional method of array transpose
 needs an auxiliary array of the same size and a copy
 back stage. We recently developed an in-place method
 using vacancy tracking cycles. The vacancy tracking
 algorithm outperforms the traditional 2-array method as
 demonstrated by extensive comparisons. The independence
 of vacancy tracking cycles allows efficient
 parallelization of the in-place method on SMP
 architectures at node level. Performance of
 multi-threaded parallelism using OpenMP are tested with
 different scheduling methods and different number of
 threads. The vacancy tracking method is parallelized
 using several parallel paradigms. At node level, pure
 OpenMP outperforms pure MPI by a factor of 2.76. Across
 entire cluster of SMP nodes, the hybrid MPI/OpenMP
 implementation outperforms pure MPI by a factor of
 4.44, demonstrating the validity of the parallel
 paradigm of mixing MPI with OpenMP.",
 acknowledgement = ack-nhfb,
 keywords = "multidimensional arrays; index reshuffle; vacancy
 tracking cycles; global exchange; dynamical remapping;
 MPI; OpenMP; hybrid MPI/OpenMP; SMP cluster",
}
@InProceedings{Hacker:2002:ESP,
author = "Thomas J. Hacker and Brian D. Noble and Brian D.
Athey",
title = "The Effects of Systemic Packet Loss on Aggregate {TCP}
Flows",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap270.pdf",
abstract = "The use of parallel TCP connections to increase
throughput for bulk transfers is common practice within
the high performance computing community. However, the
effectiveness, fairness, and efficiency of data
transfers across parallel connections is unclear. This
paper considers the impact of systemic non-congestion
related packet loss on the effectiveness, fairness, and
efficiency of parallel TCP transmissions. The results
indicate that parallel connections are effective at
increasing aggregate throughput, and increase the
overall efficiency of the network bottleneck. In the
presence of congestion related losses, parallel flows
steal bandwidth from other single stream flows. A
simple modification is presented that reduces the
fairness problems when congestion is present, but
retains effectiveness and efficiency.",
acknowledgement = ack-nhfb,
}
@InProceedings{Pradhan:2002:IEQ,
author = "Prashant Pradhan and Tzi-cker Chiueh",
title = "Implementation and Evaluation of a {QoS}-Capable
Cluster-Based {IP} Router",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap320.pdf",
abstract = "A major challenge in Internet edge router design is to
support both high packet forwarding performance and
versatile and efficient packet processing capabilities.
The thesis of this research project is that a cluster
of PCs connected by a high-speed system area network
provides an effective hardware platform for building
routers to be used at the edges of the Internet. This
paper describes a scalable and extensible edge router
architecture called Panama, which supports a novel
aggregate route caching scheme, a real-time link
scheduling algorithm whose performance overhead is
independent of the number of real-time flows, a highly
efficient kernel extension mechanism to safely load
networking software extensions dynamically, and an
integrated resource scheduler which ensures that
real-time flows with additional packet processing
requirements still meet their end-to-end performance
requirements. This paper describes the implementation
and evaluation of the first Panama prototype based on a
cluster of PCs and Myrinet.",
acknowledgement = ack-nhfb,
}
@InProceedings{Dunigan:2002:TTD,
author = "Tom Dunigan and Matt Mathis and Brian Tierney",
title = "A {TCP} Tuning Daemon",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap151.pdf",
abstract = "Many high performance distributed applications require
high network throughput but are able to achieve only a
small fraction of the available bandwidth. A common
cause of this problem is improperly tuned network
settings. Tuning techniques, such as setting the
correct TCP buffers and using parallel streams, are
well known in the networking community, but outside the
networking community they are infrequently applied. In
this paper, we describe a tuning daemon that uses TCP
instrumentation data from the Unix kernel to
transparently tune TCP parameters for specified
individual flows over designated paths. No
modifications are required to the application, and the
user does not need to understand network or TCP
characteristics.",
acknowledgement = ack-nhfb,
keywords = "autotuning; TCP; high-performance networking; data
grids",
}
@InProceedings{Malard:2002:DDH,
author = "J. M. Malard and R. D. Stewart",
title = "Distributed Dynamic Hash Tables Using {IBM LAPI}",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap247.pdf",
abstract = "An asynchronous communication library for accessing
and managing dynamic hash tables over a network of
Symmetric Multiprocessors (SMP) is presented. A
blocking factor is shown experimentally to reduce the
variance of the wall clock time. It is also shown that
remote accesses to a distributed hash table can be as
effective and scalable as the one-sided operations of
the low-level communication middleware on an IBM SP.",
acknowledgement = ack-nhfb,
}
@InProceedings{Swany:2002:MRP,
author = "Martin Swany and Rich Wolski",
title = "Multivariate Resource Performance Forecasting in the
{Network Weather Service}",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap292.pdf",
abstract = "This paper describes a new technique in the Network
Weather Service for producing multi-variate forecasts.
The new technique uses the NWS's univariate forecasters
and empirically gathered Cumulative Distribution
Functions (CDFs) to make predictions from correlated
measurement streams. Experimental results are shown in
which throughput is predicted for long TCP/IP transfers
from short NWS network probes.",
acknowledgement = ack-nhfb,
}
@InProceedings{Otoo:2002:DCR,
author = "Ekow J. Otoo and Frank Olken and Arie Shoshani",
title = "Disk Cache Replacement Algorithm for Storage Resource
Managers in Data Grids",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap322.pdf",
abstract = "We address the problem of cache replacement policies
for Storage Resource Managers (SRMs) that are used in
Data Grids. An SRM has a disk storage of bounded
capacity that retains some N objects. A replacement
policy is applied to determine which object in the
cache needs to be evicted when space is needed. We
define a utility function for ranking the candidate
objects for eviction and then describe an efficient
algorithm for computing the replacement policy based on
this function. This computation takes time $O(\log N)$.
We compare our policy with traditional replacement
policies such as Least Frequently Used (LFU), Least
Recently Used (LRU), LRU-K, Greedy Dual Size (GDS),
etc., using simulations of both synthetic and real
workloads of file accesses to tertiary storage. Our
simulations of replacement policies account for delays
in cache space reservation, data transfer and
processing. The results obtained show that our proposed
method is the most cost effective cache replacement
policy for Storage Resource Managers (SRM).",
acknowledgement = ack-nhfb,
keywords = "file caching; cache replacement algorithm;
trace-driven simulation; data staging; storage resource
management",
}
@InProceedings{Radovic:2002:ESN,
author = "Zoran Radovic and Erik Hagersten",
title = "Efficient Synchronization for Nonuniform Communication
Architectures",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap221.pdf",
abstract = "Scalable parallel computers are often nonuniform
communication architectures (NUCAs), where the access
time to other processor's caches vary with their
physical location. Still, few attempts of exploring
cache-to-cache communication locality have been made.
This paper introduces a new kind of synchronization
primitives (lock-unlock) that favor neighboring
processors when a lock is released. This improves the
lock handover time as well as access time to the shared
data of the critical region. A critical section guarded
by our new RH lock takes less than half the time to
execute compared with the same critical section guarded
by any other lock on our NUCA hardware. The execution
time for Raytrace with 28 processors was improved
2.23--4.68 times, while global traffic was dramatically
decreased compared with all the other locks. The
average execution time was improved 7--24\% while the
global traffic was decreased 8--28\% for an average
over the seven applications studied.",
acknowledgement = ack-nhfb,
}
@InProceedings{Sistare:2002:UHP,
author = "Steven J. Sistare and Christopher J. Jackson",
title = "Ultra-High Performance Communication with {MPI} and
the {Sun Fire(\TM)} Link Interconnect",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap142.pdf",
abstract = "We present a new low-latency system area network that
provides the ultra-high bandwidth needed to fuse a
collection of large SMP servers into a capability
cluster. The network adapter exports a remote shared
memory (RSM) model that supports low latency kernel
bypass messaging. The Sun\TM{} MPI library uses the RSM
interface to implement a highly efficient
memory-to-memory messaging protocol in which the
library directly manages buffers and data structures in
remote memory. This allows flexible allocation of
buffer space to active connections, while avoiding
resource contention that could otherwise increase
latencies. We discuss the characteristics of the
interconnect, describe the MPI protocols, and measure
the performance of a number of MPI benchmarks. Our
results include MPI inter-node bandwidths of almost 3
Gigabytes per second and MPI ping-pong latencies as low
as 3.7 microseconds.",
acknowledgement = ack-nhfb,
keywords = "interconnects; MPI; kernel bypass; remote shared
memory; SAN; performance evaluation",
}
@InProceedings{Eberle:2002:SHB,
author = "Hans Eberle and Nils Gura",
title = "Separated High-bandwidth and Low-latency Communication
in the Cluster Interconnect {Clint}",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap259.pdf",
abstract = "An interconnect for a high-performance cluster has to
be optimized in respect to both high throughput and low
latency. To avoid the tradeoff between throughput and
latency, the cluster interconnect Clint has a
segregated architecture that provides two physically
separate transmission channels: A bulk channel
optimized for high-bandwidth traffic and a quick
channel optimized for low-latency traffic. Different
scheduling strategies are applied. The bulk channel
uses a scheduler that globally allocates time slots on
the transmission paths before packets are sent off.
This way collisions as well as blockages are avoided.
In contrast, the quick channel takes a best-effort
approach by sending packets whenever they are available
thereby risking collisions and
retransmissions.\par
Simulation results clearly show the performance
advantages of the segregated architecture. The
carefully scheduled bulk channel can be loaded nearly
to its full capacity without exhibiting head-of-line
blocking that limits many networks while the quick
channel provides low-latency communication even in the
presence of high-bandwidth traffic.",
acknowledgement = ack-nhfb,
}
@InProceedings{Vetter:2002:EPE,
author = "Jeffrey S. Vetter and Andy Yoo",
title = "An Empirical Performance Evaluation of Scalable
Scientific Applications",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap222.pdf",
abstract = "We investigate the scalability, architectural
requirements, and performance characteristics of eight
scalable scientific applications. Our analysis is
driven by empirical measurements using statistical and
tracing instrumentation for both communication and
computation. Based on these measurements, we refine our
analysis into precise explanations of the factors that
influence performance and scalability for each
application; we distill these factors into common
traits and overall recommendations for both users and
designers of scalable platforms. Our experiments
demonstrate that some traits, such as improvements in
the scaling and performance of MPI's collective
operations, will benefit most applications. We also
find specific characteristics of some applications that
limit performance. For example, one application's
intensive use of a 64-bit, floating-point divide
instruction, which has high latency and is not
pipelined on the POWER3, limits the performance of the
application's primary computation.",
acknowledgement = ack-nhfb,
}
@InProceedings{El-Ghazawi:2002:UPP,
author = "Tarek El-Ghazawi and Fran{\c{c}}ois Cantonnet",
title = "{UPC} Performance and Potential: {A} {NPB}
Experimental Study",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap316.pdf",
abstract = "UPC, or Unified Parallel C, is a parallel extension of
ANSI C. UPC follows a distributed shared memory
programming model aimed at leveraging the ease of
programming of the shared memory paradigm, while
enabling the exploitation of data locality. UPC
incorporates constructs that allow placing data near
the threads that manipulate them to minimize remote
accesses. This paper gives an overview of the concepts
and features of UPC and establishes, through extensive
performance measurements of NPB workloads, the
viability of the UPC programming language compared to
the other popular paradigms. Further, through
performance measurements we identify the challenges,
the remaining steps and the priorities for UPC. It will
be shown that with proper hand tuning and optimized
collective operations libraries, UPC performance will
be comparable to that of MPI. Furthermore, by
incorporating such improvements into automatic compiler
optimizations, UPC will compare quite favorably to
message passing in ease of programming.",
acknowledgement = ack-nhfb,
keywords = "NPB (NAS Parallel Benchmark)",
}
@InProceedings{Worley:2002:SUC,
author = "Patrick H. Worley",
title = "Scaling the Unscalable: {A} Case Study on the
{AlphaServer SC}",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap155.pdf",
abstract = "A case study of the optimization of a climate modeling
application on the Compaq AlphaServer SC at the
Pittsburgh Supercomputer Center is used to illustrate
tools and techniques that are important to achieving
good performance scaling.",
acknowledgement = ack-nhfb,
}
@InProceedings{Schussman:2002:AVT,
author = "Greg Schussman and Brett Wilson and Kwok Ko and Ji
Qiang and Robert Ryne and Kwan-Liu Ma",
title = "Advanced Visualization Technology for Terascale
Particle Accelerator Simulations",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap224.pdf",
abstract = "This paper presents two new hardware-assisted
rendering techniques developed for interactive
visualization of the terascale data generated from
numerical modeling of next-generation accelerator
designs. The first technique, based on a hybrid
rendering approach, makes possible interactive
exploration of large-scale particle data from particle
beam dynamics modeling. The second technique, based on
a compact texture-enhanced representation, exploits the
advanced features of commodity graphics cards to
achieve perceptually effective visualization of the
very dense and complex electromagnetic fields produced
from the modeling of reflection and transmission
properties of open structures in an accelerator design.
Because of the collaborative nature of the overall
accelerator modeling project, the visualization
technology developed is for both desktop and remote
visualization settings. We have tested the techniques
using both time-varying particle data sets containing
up to one billion particles per time step and
electromagnetic field data sets with millions of mesh
elements.",
acknowledgement = ack-nhfb,
keywords = "hardware-assisted techniques; high-performance
computing; particle accelerators; perception;
point-based rendering; scientific visualization; field
lines; texture mapping; time-varying data; vector field
visualization; visual cues; volume rendering",
}
@InProceedings{Wolf:2002:SPS,
author = "Matthew Wolf and Zhongtang Cai and Weiyun Huang and
Karsten Schwan",
title = "{SmartPointers}: Personalized Scientific Data Portals
In Your Hand",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap304.pdf",
abstract = "The SmartPointer system provides a paradigm for
utilizing multiple light-weight client endpoints in a
real-time scientific visualization infrastructure.
Together, the client and server infrastructure form a
new type of data portal for scientific computing. The
clients can be used to personalize data for the needs
of the individual scientist. This personalization of a
shared dataset is designed to allow multiple
scientists, each with their laptops or iPaqs to explore
the dataset from different angles and with different
personalized filters. As an example, iPaq clients can
display 2D derived data functions which can be used to
dynamically update and annotate the shared data space,
which might be visualized separately on a large
immersive display such as a CAVE. Measurements are
presented for such a system, built upon the ECho
middleware system developed at Georgia Tech.",
acknowledgement = ack-nhfb,
}
@InProceedings{Snavely:2002:FPM,
author = "Allan Snavely and Laura Carrington and Nicole Wolter
and Jesus Labarta and Rosa Badia and Avi Purkayastha",
title = "A Framework for Performance Modeling and Prediction",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap201.pdf",
abstract = "Cycle-accurate simulation is far too slow for modeling
the expected performance of full parallel applications
on large HPC systems. And just running an application
on a system and observing wallclock time tells you
nothing about why the application performs as it does
(and is anyway impossible on yet-to-be-built systems).
Here we present a framework for performance modeling
and prediction that is faster than cycle-accurate
simulation, more informative than simple benchmarking,
and is shown useful for performance investigations in
several dimensions.",
acknowledgement = ack-nhfb,
}
@InProceedings{Gopalan:2002:IRL,
author = "Kartik Gopalan and Tzi-cker Chiueh",
title = "Improving Route Lookup Performance Using Network
Processor Cache",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap272.pdf",
abstract = "Earlier research has shown that the route lookup
performance of a network processor can be significantly
improved by caching ranges of lookup/classification
keys rather than individual keys. While the previous
work focused specifically on reducing capacity misses,
we address two other important aspects --- (a) reducing
conflict misses and (b) cache consistency during
frequent route updates. We propose two techniques to
minimize conflict misses that aim to balance the number
of cacheable entries mapped to each cache set. They
offer different tradeoffs between performance and
simplicity while improving the average route lookup
time by 76\% and 45.2\% respectively. To maintain cache
consistency during frequent route updates, we propose a
selective cache invalidation technique that can limit
the degradation in lookup latency to within 10.2\%. Our
results indicate potentially large improvement in
lookup performance for network processors used at
Internet edge and motivate further research into
caching at the Internet core.",
acknowledgement = ack-nhfb,
}
@InProceedings{Athanasaki:2002:PST,
author = "Maria Athanasaki and Aristidis Sotiropoulos and
Georgios Tsoukalas and Nectarios Koziris",
title = "Pipelined Scheduling of Tiled Nested Loops onto
Clusters of {SMP}s using Memory Mapped Network
Interfaces",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap132.pdf",
abstract = "This paper describes the performance benefits attained
using enhanced network interfaces to achieve low
latency communication. We present a novel, pipelined
scheduling approach which takes advantage of DMA
communication mode, to send data to other nodes, while
the CPUs are performing calculations. We also use
zero-copy communication through pinned-down physical
memory regions, provided by NIC's driver modules. Our
testbed concerns the parallel execution of tiled nested
loops onto a cluster of SMP nodes with single PCI-SCI
NICs inside each node. In order to schedule tiles, we
apply a hyperplane-based grouping transformation to the
tiled space, so as to group together independent
neighboring tiles and assign them to the same SMP node.
Experimental evaluation illustrates that memory mapped
NICs with enhanced communication features enable the
use of a more advanced pipelined (overlapping)
schedule, which considerably improves performance,
compared to an ordinary blocking schedule, implemented
with conventional, CPU and kernel bounded,
communication primitives.",
acknowledgement = ack-nhfb,
keywords = "memory mapped network interfaces; DMA; pipelined
schedules; tile grouping; communication overlapping;
SMPs",
}
@InProceedings{Hiraki:2002:DRU,
author = "Kei Hiraki and Mary Inaba and Junji Tamatsukuri and
Ryutaro Kurusu and Yukichi Ikuta and Hisashi Koga and
Akira Zinzaki",
title = "Data Reservoir: Utilization of Multi-Gigabit Backbone
Network for Data-Intensive Research",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap327.pdf",
abstract = "We propose data sharing facility for data intensive
scientific research, ``Data Reservoir''; which is
optimized to transfer huge amount of data files between
distant places fully utilizing multi-gigabit backbone
network. In addition, ``Data Reservoir'' can be used as
an ordinary UNIX server in local network without any
modification of server software. We use low-level
protocol and hierarchical striping to realize (1)
separation of bulk data transfer and local accesses by
caching, (2) file-system transparency, i.e.,
interoperable whatever in higher layer than disk
driver, including file system. (3) scalability for
network and storage. This paper shows our design,
implementation using iSCSI protocol [1] and their
performances for both 1Gbps model in the real network
and 10Gbps model in our laboratory.",
acknowledgement = ack-nhfb,
}
@InProceedings{Li:2002:NSA,
author = "Laura Grigori and Xiaoye S. Li",
title = "A New Scheduling Algorithm For Parallel Sparse {LU}
Factorization with Static Pivoting",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap139.pdf",
abstract = "In this paper we present a static scheduling algorithm
for parallel sparse LU factorization with static
pivoting. The algorithm is divided into mapping and
scheduling phases, using the symmetric pruned graphs of
$L^T$ and $U$ to represent dependencies. The scheduling
algorithm is designed for driving the parallel
execution of the factorization on a distributed-memory
architecture. Experimental results and comparisons with
SuperLU DIST are reported after applying this algorithm
on real world application matrices on an IBM SP RS/6000
distributed memory machine.",
acknowledgement = ack-nhfb,
}
@InProceedings{Vuduc:2002:POB,
author = "Richard Vuduc and James W. Demmel and Katherine A.
Yelick and Shoaib Kamil and Rajesh Nishtala and
Benjamin Lee",
title = "Performance Optimizations and Bounds for Sparse
Matrix-Vector Multiply",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap317.pdf",
abstract = "We consider performance tuning, by code and data
structure reorganization, of sparse matrix-vector
multiply (SpMxV), one of the most important
computational kernels in scientific applications. This
paper addresses the fundamental questions of what
limits exist on such performance tuning, and how
closely tuned code approaches these limits.
Specifically, we develop upper and lower bounds on the
performance (Mflop/s) of SpMxV when tuned using our
previously proposed register blocking optimization.
These bounds are based on the non-zero pattern in the
matrix and the cost of basic memory operations, such as
cache hits and misses. We evaluate our tuned
implementations with respect to these bounds using
hardware counter data on 4 different platforms and on a
test set of 44 sparse matrices. We find that we can
often get within 20\% of the upper bound, particularly
on a class of matrices from finite element modeling
(FEM) problems; on non-FEM matrices, performance
improvements of $2\times$ are still possible. Lastly,
we present a new heuristic that selects optimal or
near-optimal register block sizes (the key tuning
parameters) more accurately than our previous
heuristic. Using the new heuristic, we show
improvements in SpMxV performance (Mflop/s) by as much
as $2.5\times$ over an untuned implementation.
Collectively, our results suggest that future
performance improvements, beyond those that we have
already demonstrated for SpMxV, will come from two
sources: (1) consideration of higher-level matrix
structures (e.g., exploiting symmetry, matrix
reordering, multiple register block sizes), and (2)
optimizing kernels with more opportunity for data reuse
(e.g., sparse matrix-multiple vector multiply,
multiplication of $A^T A$ by a vector).",
acknowledgement = ack-nhfb,
}
@InProceedings{Teranishi:2002:NDM,
author = "Keita Teranishi and Padma Raghavan and Esmond Ng",
title = "A New Data-Mapping Scheme For Latency-Tolerant
Distributed Sparse Triangular Solution",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap238.pdf",
abstract = "This paper concerns latency-tolerant schemes for the
efficient parallel solution of sparse triangular linear
systems on distributed memory multiprocessors. Such
triangular solution is required when sparse Cholesky
factors are used to solve for a sequence of
right-hand-side vectors or when incomplete sparse
Cholesky factors are used to precondition a Conjugate
Gradients iterative solver. In such applications, the
use of traditional distributed substitution schemes can
create a performance bottleneck when the latency of
interprocessor communication is large. We had earlier
developed the Selective Inversion (SI) scheme to reduce
communication latency costs by replacing distributed
substitution by parallel matrix vector multiplication.
We now present a new two-way mapping of the triangular
sparse matrix to processors to improve the performance
of SI by halving its communication latency costs. We
provide analytic results for model sparse matrices and
we report on the performance of our scheme for parallel
preconditioning with incomplete sparse Cholesky
factors.",
acknowledgement = ack-nhfb,
}
@InProceedings{Traff:2002:IMP,
author = "Jesper Larsson Tr{\"a}ff",
title = "Implementing the {MPI} Process Topology Mechanism",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap122.pdf",
abstract = "The topology functionality of the Message Passing
Interface (MPI) provides a portable,
architecture-independent means for adapting application
programs to the communication architecture of the
target hardware. However, current MPI implementations
rarely go beyond the most trivial implementation, and
simply perform no process remapping. We discuss the
potential of the topology mechanism for systems with a
hierarchical communication architecture like clusters
of SMP nodes. The MPI topology functionality is a weak
mechanism, and we argue about some of its shortcomings.
We formulate the topology optimization problem as a
graph embedding problem, and show that for hierarchical
systems it can be solved by graph partitioning. We
state the properties of a new heuristic for solving
both the embedding problem and the ``easier'' graph
partitioning problem. The graph partitioning based
framework has been fully implemented in MPI/SX for the
NEC SX-series of parallel vector computers. MPI/SX is
thus one of very few MPI implementations with a
non-trivial topology functionality. On a 4 node NEC
SX-6 significant communication performance improvements
are achieved with synthetic MPI benchmarks.",
acknowledgement = ack-nhfb,
}
@InProceedings{Bosilca:2002:MVT,
author = "George Bosilca and Aurelien Bouteiller and Franck
Cappello and Samir Djilali and Gilles Fedak and Cecile
Germain and Thomas Herault and Pierre Lemarinier and
Oleg Lodygensky and Frederic Magniette and Vincent Neri
and Anton Selikhov",
title = "{MPICH-V}: Toward a Scalable Fault Tolerant {MPI} for
Volatile Nodes",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap298.pdf",
abstract = "Global Computing platforms, large scale clusters and
future TeraGRID systems gather thousands of nodes for
computing parallel scientific applications. At this
scale, node failures or disconnections are frequent
events. This Volatility reduces the MTBF of the whole
system in the range of hours or minutes. We present
MPICH-V, an automatic Volatility tolerant MPI
environment based on uncoordinated checkpoint/ rollback
and distributed message logging. MPICH-V architecture
relies on Channel Memories, Checkpoint servers and
theoretically proven protocols to execute existing or
new, SPMD and Master-Worker MPI applications on
volatile nodes. To evaluate its capabilities, we run
MPICH-V within a framework for which the number of
nodes, Channels Memories and Checkpoint Servers can be
completely configured as well as the node Volatility.
We present a detailed performance evaluation of every
component of MPICH-V and its global performance for
non-trivial parallel applications. Experimental results
demonstrate good scalability and high tolerance to node
volatility.",
acknowledgement = ack-nhfb,
}
@InProceedings{Chiu:2002:PMM,
author = "Kenneth Chiu and Madhusudhan Govindaraju and Dennis
Gannon",
title = "The {Proteus Multiprotocol Message Library}",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap315.pdf",
abstract = "Grid systems span manifold organizations and
application domains. Because this diverse environment
inevitably engenders multiple protocols,
interoperability mechanisms are crucial to seamless,
pervasive access. This paper presents the design,
rationale, and implementation of the Proteus
multiprotocol library for integrating multiple message
protocols, such as SOAP and JMS, within one system.
Proteus decouples application code from protocol code
at run-time, allowing clients to incorporate separately
developed protocols without recompiling or halting.
Through generic serialization, which separates the
transfer syntax from the message type, protocols can
also be added independently of serialization routines.
We also show performance-enhancing mechanisms for Grid
services that examine metadata, but pass actual data
through opaquely (such as adapters). The interface
provided to protocol implementors is general enough to
support protocols as disparate as our current
implementations: SOAP, JMS, and binary. Proteus is
written in C++; a Java port is planned.",
acknowledgement = ack-nhfb,
}
@InProceedings{Parello:2002:IAA,
author = "David Parello and Olivier Temam and Jean-Marie
Verdun",
title = "On Increasing Architecture Awareness in Program
Optimizations to Bridge the Gap between Peak and
Sustained Processor Performance -- Matrix-Multiply
Revisited",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap107.pdf",
abstract = "As the complexity of processor architectures
increases, there is a widening gap between peak
processor performance and sustained processor
performance so that programs now tend to exploit only a
fraction of available performance. While there is a
tremendous amount of literature on program
optimizations, compiler optimizations lack efficiency
because they are plagued by three flaws: (1) they often
implicitly use simplified, if not simplistic, models of
processor architecture, (2) they usually focus on a
single processor component (e.g., cache) and ignore the
interactions among multiple components, (3) the most
heavily investigated components (e.g., caches)
sometimes have only a small impact on overall
performance. Through the in-depth analysis of a simple
program kernel, we want to show that understanding the
complex interactions between programs and the numerous
processor architecture components is both feasible and
critical to design efficient program optimizations.",
acknowledgement = ack-nhfb,
}
@InProceedings{Pike:2002:BTA,
author = "Geoff Pike and Paul N. Hilfinger",
title = "Better Tiling and Array Contraction for Compiling
Scientific Programs",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap260.pdf",
abstract = "Scientific programs often include multiple loops over
the same data; interleaving parts of different loops
may greatly improve performance. We exploit this in a
compiler for Titanium, a dialect of Java. Our compiler
combines reordering optimizations such as loop fusion
and tiling with storage optimizations such as array
contraction (eliminating or reducing the size of
temporary arrays). The programmers we have in mind are
willing to spend some time tuning their code and their
compiler parameters. Given that, and the difficulty in
statically selecting parameters such as tile sizes, it
makes sense to provide automatic parameter searching
alongside the compiler. Our strategy is to optimize
aggressively but to expose the compiler's decisions to
external control. We double or triple the performance
of Gauss--Seidel relaxation and multigrid (versus an
optimizing compiler without tiling and array
contraction), and we argue that ours is the best
compiler for that kind of program.",
acknowledgement = ack-nhfb,
}
@InProceedings{Vetter:2002:APE,
author = "Jeffrey S. Vetter and Patrick H. Worley",
title = "Asserting Performance Expectations",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap275.pdf",
abstract = "Traditional techniques for performance analysis
provide a means for extracting and analyzing raw
performance information from applications. Users then
compare this raw data to their performance expectations
for application constructs. This comparison can be
tedious for the scale of today's architectures and
software systems. To address this situation, we present
a methodology and prototype that allows users to assert
performance expectations explicitly in their source
code using performance assertions. As the application
executes, each performance assertion in the application
collects data implicitly to verify the assertion. By
allowing the user to specify a performance expectation
with individual code segments, the runtime system can
jettison raw data for measurements that pass their
expectation, while reacting to failures with a variety
of responses. We present several compelling uses of
performance assertions with our operational prototype,
including raising a performance exception, validating a
performance model, and adapting an algorithm
empirically at runtime.",
acknowledgement = ack-nhfb,
}
@InProceedings{Makino:2002:TSP,
author = "Junichiro Makino and Eiichiro Kokubo and Toshiyuki
Fukushige and Hiroshi Daisaka",
title = "A {29.5 Tflops} simulation of planetesimals in
{Uranus-Neptune} region on {GRAPE-6}",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap146.pdf",
abstract = "As an entry for the 2002 Gordon Bell performance
prize, we report the performance achieved on the
GRAPE-6 system for a simulation of the early evolution
of the protoplanet-planetesimal system of the
Uranus-Neptune region. GRAPE-6 is a special-purpose
computer for astrophysical N-body calculations. The
present configuration has 2048 custom pipeline chips,
each containing six pipeline processors for the
calculation of gravitational interactions between
particles. Its theoretical peak performance is 63.4
Tflops. The actual performance obtained was 29.5
Tflops, for a simulation of the early evolution of
outer Solar system with 1.8 million planetesimals and
two massive protoplanets.",
acknowledgement = ack-nhfb,
}
@InProceedings{Bhardwaj:2002:SSS,
author = "Manoj Bhardwaj and Kendall Pierson and Garth Reese and
Tim Walsh and David Day and Ken Alvin and James Peery
and Charbel Farhat and Michel Lesoinne",
title = "{Salinas}: {A} Scalable Software for High-Performance
Structural and Solid Mechanics Simulations",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap216.pdf",
abstract = "We present Salinas, a scalable implicit software
application for the finite element static and dynamic
analysis of complex structural real-world systems. This
relatively complete engineering software with more than
100,000 lines of code and a long list of users sustains
292.5 Gflop/s on 2,940 ASCI Red processors,
and 1.16 Tflop/s on 3,375 ASCI White processors.",
acknowledgement = ack-nhfb,
}
@InProceedings{Phillips:2002:NBS,
author = "James C. Phillips and Gengbin Zheng and Sameer Kumar
and Laxmikant V. Kal{\'e}",
title = "{NAMD}: Biomolecular Simulation on Thousands of
Processors",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap277.pdf",
abstract = "NAMD is a fully featured, production molecular
dynamics program for high performance simulation of
large biomolecular systems. We have previously, at
SC2000, presented scaling results for simulations with
cutoff electrostatics on up to 2048 processors of the
ASCI Red machine, achieved with an object-based hybrid
force and spatial decomposition scheme and an
aggressive measurement-based predictive load balancing
framework. We extend this work by demonstrating similar
scaling on the much faster processors of the PSC
Lemieux Alpha cluster, and for simulations employing
efficient (order N log N) particle mesh Ewald full
electrostatics. This unprecedented scalability in a
biomolecular simulation code has been attained through
latency tolerance, adaptation to multiprocessor nodes,
and the direct use of the Quadrics Elan library in
place of MPI by the Charm++/Converse parallel runtime
system.",
acknowledgement = ack-nhfb,
}
@InProceedings{Lee:2002:IOG,
author = "William Lee and Anthony Mayer and Steven Newhouse",
title = "{ICENI}: An {Open Grid Service Architecture}
Implemented with {Jini}",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap253.pdf",
abstract = "The move towards Service Grids, where services are
composed to meet the requirements of a user community
within constraints specified by the resource provider,
presents many challenges to service provision and
description. To support our research activities in the
autonomous composition of services to form a Semantic
Service Grid we describe the adoption within ICENI of
web services to enable interoperability with the
recently proposed Open Grid Services Architecture.",
acknowledgement = ack-nhfb,
keywords = "Computational Grids; Web Services; Semantic Grid",
}
@InProceedings{Hoschek:2002:WSD,
author = "Wolfgang Hoschek",
title = "The {Web Service Discovery Architecture}",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap161.pdf",
abstract = "In this paper, we propose the Web Service Discovery
Architecture (WSDA). At runtime, Grid applications can
use this architecture to discover and adapt to remote
services. WSDA promotes an interoperable web service
discovery layer by defining appropriate services,
interfaces, operations and protocol bindings, based on
industry standards. It is unified because it subsumes
an array of disparate concepts, interfaces and
protocols under a single semi-transparent umbrella. It
is modular because it defines a small set of orthogonal
multipurpose communication primitives (building blocks)
for discovery. These primitives cover service
identification, service description retrieval, data
publication as well as minimal and powerful query
support. The architecture is open and flexible because
each primitive can be used, implemented, customized and
extended in many ways. It is powerful because the
individual primitives can be combined and plugged
together by specific clients and services to yield a
wide range of behaviors and emerging synergies.",
acknowledgement = ack-nhfb,
keywords = "WSDA (Web Service Discovery Architecture)",
}
@InProceedings{Pierce:2002:IWS,
author = "Marlon Pierce and Geoffrey Fox and Choonhan Youn and
Steve Mock and Kurt Mueller and Ozgur Balsoy",
title = "Interoperable {Web} Services for Computational
Portals",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap284.pdf",
abstract = "Computational web portals are designed to simplify
access to diverse sets of high performance computing
resources, typically through an interface to
computational Grid tools. An important shortcoming of
these portals is their lack of interoperable and
reusable services. This paper presents an overview of
research efforts undertaken by our group to build
interoperating portal services around a Web Services
model. We present a comprehensive view of an
interoperable portal architecture, beginning with core
portal services that can be used to build Application
Web Services, which in turn may be aggregated and
managed through portlet containers.",
acknowledgement = ack-nhfb,
}
@InProceedings{Stamatakis:2002:APM,
author = "Alexandros P. Stamatakis and Thomas Ludwig and Harald
Meier and Marty J. Wolf",
title = "Accelerating Parallel Maximum Likelihood-based
Phylogenetic Tree Calculations using Subtree Equality
Vectors",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap113.pdf",
abstract = "Heuristics for calculating phylogenetic trees for
large sets of aligned rRNA sequences based on the
maximum likelihood method are computationally
expensive. The core of most parallel algorithms, which
accounts for the greatest part of computation time, is
the tree evaluation function, that calculates the
likelihood value for each tree topology. This paper
describes and uses Subtree Equality Vectors (SEVs) to
reduce the number of required floating point operations
during topology evaluation. We integrated our
optimizations into various sequential programs and into
parallel fastDNAml, one of the most common and
efficient parallel programs for calculating large
phylogenetic trees. Experimental results for our
parallel program, which renders exactly the same output
as parallel fastDNAml show global run time improvements
of 26\% to 65\%. The optimization scales best on
clusters of PCs, which also implies a substantial cost
saving factor for the determination of large trees.",
acknowledgement = ack-nhfb,
}
@InProceedings{Akcelik:2002:PMG,
author = "Volkan Akcelik and George Biros and Omar Ghattas",
title = "Parallel Multiscale {Gauss--Newton--Krylov} Methods for
Inverse Wave Propagation",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap330.pdf",
abstract = "One of the outstanding challenges of computational
science and engineering is large-scale nonlinear
parameter estimation of systems governed by partial
differential equations. These are known as inverse
problems, in contradistinction to the forward problems
that usually characterize large-scale simulation.
Inverse problems are significantly more difficult to
solve than forward problems, due to ill-posedness,
large dense ill-conditioned operators, multiple minima,
space-time coupling, and the need to solve the forward
problem repeatedly. We present a parallel algorithm for
inverse problems governed by time-dependent PDEs, and
scalability results for an inverse wave propagation
problem of determining the material field of an
acoustic medium. The difficulties mentioned above are
addressed through a combination of total variation
regularization, preconditioned matrix-free
Gauss--Newton--Krylov iteration, algorithmic
checkpointing, and multiscale continuation. We are able
to solve a synthetic inverse wave propagation problem
through a pelvic bone geometry involving 2.1 million
inversion parameters in 3 hours on 256 processors of
the Terascale Computing System at the Pittsburgh
Supercomputing Center.",
acknowledgement = ack-nhfb,
}
@InProceedings{Hariharan:2002:SPF,
author = "Bhanu Hariharan and Srinivas Aluru and Balasubramaniam
Shanker",
title = "A Scalable Parallel Fast Multipole Method for Analysis
of Scattering from Perfect Electrically Conducting
Surfaces",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap295.pdf",
abstract = "In this paper, we develop a parallel Fast Multipole
Method (FMM) based solution for computing the scattered
electromagnetic fields from a Perfect Electrically
Conducting (PEC) surface. The main contributions of
this work are the development of parallel algorithms
with the following characteristics: (1) provably
efficient worst-case run-time irrespective of the shape
of the scatterer, (2) communication efficiency, and (3)
guaranteed load balancing within a small constant
factor. We have developed a scalable, parallel code and
validated it against surfaces for which solution can be
computed analytically, and against serial software. The
efficiency and scalability of the code is demonstrated
with experimental results on an IBM xSeries cluster.
Though developed in the context of this particular
application, our algorithms can be used in other
applications involving parallel FMM.",
acknowledgement = ack-nhfb,
}
@InProceedings{Karniadakis:2002:DLP,
author = "Suchuan Dong and George Em. Karniadakis",
title = "Dual-Level Parallelism for Deterministic and
Stochastic {CFD} Problems",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap137.pdf",
abstract = "A hybrid two-level parallelism using MPI/OpenMP is
implemented in the general-purpose spectral/hp element
CFD code NekTar to take advantage of the hierarchical
structures arising in deterministic and stochastic CFD
problems. We take a coarse grain approach to
shared-memory parallelism with OpenMP and employ a
workload-splitting scheme that can reduce the OpenMP
synchronizations to the minimum. The hybrid
implementation shows good scalability with respect to
both the problem size and the number of processors in
case of a fixed problem size. With the same number of
processors, the hybrid model with 2 (or 4) OpenMP
threads per MPI process is observed to perform better
than pure MPI and pure OpenMP on the NCSA SGI Origin
2000, while the pure MPI model performs the best on the
IBM SP3 at SDSC and on the Compaq Alpha cluster at PSC.
A key new result is that the use of threads facilitates
effectively $p$-refinement, which is crucial to adaptive
discretization using high-order methods.",
acknowledgement = ack-nhfb,
}
@InProceedings{Tapus:2002:AHT,
author = "Cristian T{\u{a}}pu{\c{s}} and I-Hsin Chung and
Jeffrey K. Hollingsworth",
title = "{Active Harmony}: Towards Automated Performance
Tuning",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap138.pdf",
abstract = "In this paper, we present the Active Harmony automated
runtime tuning system. We describe the interface used
by programs to make applications tunable. We present
the Library Specification Layer which helps program
library developers expose multiple variations of the
same API using different algorithms. The Library
Specification Language helps to select the most
appropriate program library to tune the overall
performance. We also present the optimization algorithm
used to adjust parameters in the application and the
libraries. Finally, we present results that show how
the system is able to tune several real applications.
The automated tuning system is able to tune the
application parameters to within a few percent of the
best value after evaluating only 11 out of over 1,700
possible configurations.",
acknowledgement = ack-nhfb,
}
@InProceedings{Rauber:2002:LSH,
author = "Thomas Rauber and Gudula R{\"u}nger",
title = "Library Support for Hierarchical Multi-Processor
Tasks",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap176.pdf",
abstract = "The paper considers the modular programming with
hierarchically structured multi-processor tasks on top
of SPMD tasks for distributed memory machines. The
parallel execution requires a corresponding
decomposition of the set of processors into a
hierarchical group structure onto which the tasks are
mapped. This results in a multi-level group SPMD
computation model with varying processor group
structures. The advantage of this kind of mixed task
and data parallelism is a potential to reduce the
communication overhead and to increase scalability. We
present a runtime library to support the coordination
of hierarchically structured multi-processor tasks. The
library exploits an extended parallel group SPMD
programming model and manages the entire task execution
including the dynamic hierarchy of processor groups.
The library is built on top of MPI, has an easy-to-use
interface, and leads to only a marginal overhead while
allowing static planning and dynamic restructuring.",
acknowledgement = ack-nhfb,
keywords = "mixed task and data parallelism; multiprocessor tasks;
multilevel group SPMD; hierarchical decomposition of
processor sets; library support; distributed memory",
}
@InProceedings{Frachtenberg:2002:SLF,
author = "Eitan Frachtenberg and Fabrizio Petrini and Juan
Fernandez and Salvador Coll and Scott Pakin",
title = "{STORM}: Lightning-Fast Resource Management",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap297.pdf",
abstract = "Although workstation clusters are a common platform
for high-performance computing (HPC), they remain more
difficult to manage than sequential systems or even
symmetric multiprocessors. Furthermore, as cluster
sizes increase, the quality of the resource-management
subsystem --- essentially, all of the code that runs on
a cluster other than the applications --- increasingly
impacts application efficiency. In this paper, we
present STORM, a resource-management framework designed
for scalability and performance. The key innovation
behind STORM is a software architecture that enables
resource management to exploit low-level network
features. As a result of this HPC-application-like
design, STORM is orders of magnitude faster than the
best reported results in the literature on two sample
resource-management functions: job launching and
process scheduling.",
acknowledgement = ack-nhfb,
}
@InProceedings{Colarelli:2002:MAI,
author = "Dennis Colarelli and Dirk Grunwald",
title = "Massive Arrays of Idle Disks For Storage Archives",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap312.pdf",
abstract = "The declining costs of commodity disk drives is
rapidly changing the economics of deploying large
amounts of online or near-line storage. Conventional
mass storage systems use either high performance RAID
clusters, automated tape libraries or a combination of
tape and disk. In this paper, we analyze an alternative
design using massive arrays of idle disks, or MAID. We
argue that this storage organization provides storage
densities matching or exceeding those of tape libraries
with performance similar to disk arrays. Moreover, we
show that with effective power management of individual
drives, this performance can be achieved using a very
small power budget. In particular, we show that our
power management strategy can result in the performance
comparable to an always-on RAID system while using
$1/15$th the power of such a RAID system.",
acknowledgement = ack-nhfb,
}
%%% SC2002 paper (PDF pap105): Gilgamesh processor-in-memory architecture.
%%% Bibliographic data (booktitle, publisher, ISBN) inherited via crossref.
@InProceedings{Sterling:2002:GMP,
author = "Thomas L. Sterling and Hans P. Zima",
title = "{Gilgamesh}: {A} Multithreaded Processor-In-Memory
Architecture for Petaflops Computing",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap105.pdf",
abstract = "Processor-in-Memory (PIM) architectures avoid the von
Neumann bottleneck in conventional machines by
integrating high-density DRAM and CMOS logic on the
same chip. Parallel systems based on this new
technology are expected to provide higher scalability,
adaptability, robustness, fault tolerance and lower
power consumption than current MPPs or commodity
clusters. In this paper we describe the design of
Gilgamesh, a PIM-based massively parallel architecture,
and elements of its execution model. Gilgamesh extends
existing PIM capabilities by incorporating advanced
mechanisms for virtualizing tasks and data and
providing adaptive resource management for load
balancing and latency tolerance. The Gilgamesh
execution model is based on macroservers, a middleware
layer which supports object-based runtime management of
data and threads allowing explicit and dynamic control
of locality and load balancing. The paper concludes
with a discussion of related research activities and an
outlook to future work.",
acknowledgement = ack-nhfb,
}
%%% SC2002 paper (PDF pap102): owner prediction for cc-NUMA coherence.
%%% NOTE(review): restored Spanish accents in the author names using BibTeX
%%% special characters ({\'e}, {\'a}, {\'\i}) so the file stays ISO/ASCII
%%% as declared in the header; verify spellings against the published paper.
@InProceedings{Acacio:2002:OPA,
author = "Manuel E. Acacio and Jos{\'e} Gonz{\'a}lez and Jos{\'e} M. Garc{\'\i}a
and Jos{\'e} Duato",
title = "Owner Prediction for Accelerating Cache-to-Cache
Transfer Misses in a cc-{NUMA} Architecture",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap102.pdf",
abstract = "Cache misses for which data must be obtained from a
remote cache (cache-to-cache transfer misses) account
for an important fraction of the total miss rate.
Unfortunately, cc-NUMA designs put the access to the
directory information into the critical path of 3-hop
misses, which significantly penalizes them compared to
SMP designs. This work studies the use of owner
prediction as a means of providing cc-NUMA
multiprocessors with a more efficient support for
cache-to-cache transfer misses. Our proposal comprises
an effective prediction scheme as well as a coherence
protocol designed to support the use of prediction.
Results indicate that owner prediction can
significantly reduce the latency of cache-to-cache
transfer misses, which translates into speed-ups on
application performance up to 12\%. In order to also
accelerate most of those 3-hop misses that are either
not predicted or mispredicted, the inclusion of a small
and fast directory cache in every node is evaluated,
leading to improvements up to 16\% on the final
performance.",
acknowledgement = ack-nhfb,
}
%%% SC2002 paper (PDF pap273): 16.4 Tflops turbulence DNS on the Earth
%%% Simulator (Gordon Bell class).
%%% NOTE(review): restored superscripts lost in text extraction:
%%% "40963" -> $4096^3$ and "20483" -> $2048^3$ (cube grid sizes).
@InProceedings{Ishihara:2002:TDN,
author = "Mitsuo Yokokawa and Ken'ichi Itakura and Atsuya Uno
and Takashi Ishihara and Yukio Kaneda",
title = "{16.4 Tflops} Direct Numerical Simulation of
Turbulence by {Fourier} Spectral Method on the {Earth
Simulator}",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap273.pdf",
abstract = "The high-resolution direct numerical simulations
(DNSs) of incompressible turbulence with numbers of
grid points up to $4096^3$ have been executed on the Earth
Simulator (ES). The DNSs are based on the Fourier
spectral method, so that the equation for mass
conservation is accurately solved. In DNS based on the
spectral method, most of the computation time is
consumed in calculating the three-dimensional (3D) Fast
Fourier Transform (FFT), which requires huge-scale
global data transfer and has been the major stumbling
block that has prevented truly high-performance
computing. By implementing new methods to efficiently
perform the 3D-FFT on the ES, we have achieved DNS at
16.4 Tflops on $2048^3$ grid points. The DNS yields an
energy spectrum exhibiting a wide inertial subrange, in
contrast to previous DNSs with lower resolutions, and
therefore provides valuable data for the study of the
universal features of turbulence at large Reynolds
number.",
acknowledgement = ack-nhfb,
}
%%% SC2002 paper (PDF pap147): 14.9 TFLOPS HPF fluid simulation on the
%%% Earth Simulator.
%%% NOTE(review): set the mesh size "2048x2048x4096" in math mode with
%%% \times instead of a literal "x".
@InProceedings{Sakagami:2002:TTD,
author = "Hitoshi Sakagami and Hitoshi Murai and Yoshiki Seo and
Mitsuo Yokokawa",
title = "{14.9 TFLOPS} Three-dimensional Fluid Simulation for
Fusion Science with {HPF} on the {Earth Simulator}",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap147.pdf",
abstract = "We succeeded in getting 14.9 TFLOPS performance when
running a plasma simulation code IMPACT-3D parallelized
with High Performance Fortran on 512 nodes of the Earth
Simulator. The theoretical peak performance of the 512
nodes is 32 TFLOPS, which means 45\% of the peak
performance was obtained with HPF. IMPACT-3D is an
implosion analysis code using TVD scheme, which
performs three-dimensional compressible and inviscid
Eulerian fluid computation with the explicit 5-point
stencil scheme for spatial differentiation and the
fractional time step for time integration. The mesh
size is $2048 \times 2048 \times 4096$, and the third dimension was
distributed for the parallelization. The HPF system
used in the evaluation is HPF/ES, developed for the
Earth Simulator by enhancing NEC HPF/SX V2 mainly in
communication scalability. Shift communications were
manually tuned to get best performance by using HPF/JA
extensions, which was designed to give the users more
control over sophisticated parallelization and
communication optimizations.",
acknowledgement = ack-nhfb,
}
%%% SC2002 paper (PDF pap331): 26.58 Tflops AFES global atmospheric
%%% simulation on the full 640-node Earth Simulator.
@InProceedings{Shingu:2002:TGA,
author = "Satoru Shingu and Hiroshi Takahara and Hiromitsu
Fuchigami and Masayuki Yamada and Yoshinori Tsuda and
Wataru Ohfuchi and Yuji Sasaki and Kazuo Kobayashi and
Takashi Hagiwara and Shin-ichi Habata and Mitsuo
Yokokawa and Hiroyuki Itoh and Kiyoshi Otsuka",
title = "A {26.58 Tflops} Global Atmospheric Simulation with
the Spectral Transform Method on the {Earth
Simulator}",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap331.pdf",
abstract = "A spectral atmospheric general circulation model
called AFES (AGCM for Earth Simulator) was developed
and optimized for the architecture of the Earth
Simulator (ES). The ES is a massively parallel vector
supercomputer that consists of 640 processor nodes
interconnected by a single stage crossbar network with
its total peak performance of 40.96 Tflops. The
sustained performance of 26.58 Tflops was achieved for
a high resolution simulation (T1279L96) with AFES by
utilizing the full 640-node configuration of the ES.
The resulting computing efficiency is 64.9\% of the
peak performance, well surpassing that of conventional
weather/climate applications having just 25--50\%
efficiency even on vector parallel computers. This
remarkable performance proves the effectiveness of the
ES as a viable means for practical applications.",
acknowledgement = ack-nhfb,
}
%%% SC2002 paper (PDF pap158): Sun Fire system-interconnect instrumentation.
%%% NOTE(review): \TM{} (trademark) is not a standard LaTeX macro; it is
%%% presumably defined in this file's @Preamble (outside this view) ---
%%% verify before extracting this entry into another .bib file.
@InProceedings{Noordergraaf:2002:SSI,
author = "Lisa Noordergraaf and Robert Zak",
title = "{SMP} System Interconnect Instrumentation for
Performance Analysis",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap158.pdf",
abstract = "The system interconnect is often the performance
bottleneck in SMP computers. Although modern SMPs
include event counters on processors and interconnects,
these provide limited information about the interaction
of processors vying for shared resources. Additionally,
transaction sources and addresses are not readily
available, making analysis of access patterns and data
locality difficult. Enhanced system interconnect
instrumentation is required to extract this
information.\par
This paper describes instrumentation implemented for
monitoring the system interconnect on Sun Fire\TM{}
servers. The instrumentation supports sophisticated
programmable filtering of event counters, allowing us
to construct histograms of system interconnect
activity, and a FIFO to capture trace sequences. Our
implementation results in a very small hardware
footprint, making it appropriate for inclusion in
commodity hardware.\par
We also describe a sampling of software tools and
results based on this infrastructure. Applications have
included performance profiling, architectural studies,
and hardware bringup and debugging.",
acknowledgement = ack-nhfb,
}
%%% SC2002 paper (PDF pap258): scheduling pipelined data-analysis chains
%%% on Grid resources.
@InProceedings{Spencer:2002:EMP,
author = "Matthew Spencer and Renato Ferreira and Michael Beynon
and Tahsin Kurc and Umit Catalyurek and Alan Sussman
and Joel Saltz",
title = "Executing Multiple Pipelined Data Analysis Operations
in the Grid",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap258.pdf",
abstract = "Processing of data in many data analysis applications
can be represented as an acyclic, coarse grain data
flow, from data sources to the client. This paper is
concerned with scheduling of multiple data analysis
operations, each of which is represented as a pipelined
chain of processing on data. We define the scheduling
problem for effectively placing components onto Grid
resources, and propose two scheduling algorithms.
Experimental results are presented using a
visualization application.",
acknowledgement = ack-nhfb,
}
%%% SC2002 paper (PDF pap229): decoupled scheduler for the GrADS Grid
%%% program-development environment.
@InProceedings{Dail:2002:DSA,
author = "Holly Dail and Henri Casanova and Fran Berman",
title = "A Decoupled Scheduling Approach for the {GrADS}
Program Development Environment",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap229.pdf",
abstract = "Program development environments are instrumental in
providing users with easy and efficient access to
parallel computing platforms. While a number of such
environments have been widely accepted and used for
traditional HPC systems, there are currently no widely
used environments for Grid programming. The goal of the
Grid Application Development Software (GrADS) project
is to develop a coordinated set of tools, libraries and
run-time execution facilities for Grid program
development. In this paper, we describe a Grid
scheduler component that is integrated as part of the
GrADS software system. Traditionally, application-level
schedulers (e.g. AppLeS) have been tightly integrated
with the application itself and were not easily applied
to other applications. Our design is generic: we
decouple the scheduler core (the search procedure) from
the application-specific (e.g. application performance
models) and platform-specific (e.g. collection of
resource information) components used by the search
procedure. We provide experimental validation of our
approach for two representative regular, iterative
parallel programs in a variety of real-world Grid
testbeds. Our scheduler consistently outperforms static
and user-driven scheduling methods.",
acknowledgement = ack-nhfb,
}
%%% SC2002 paper (PDF pap299): Chimera virtual-data system applied to
%%% galaxy-cluster finding in the Sloan Digital Sky Survey.
@InProceedings{Annis:2002:ACV,
author = "James Annis and Yong Zhao and Jens Voeckler and
Michael Wilde and Steve Kent and Ian Foster",
title = "Applying {Chimera} Virtual Data Concepts to Cluster
Finding in the {Sloan Sky Survey}",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap299.pdf",
abstract = "In many scientific disciplines --- especially long
running, data-intensive collaborations --- it is
important to track all aspects of data capture,
production, transformation, and analysis. In principle,
one can then audit, validate, reproduce, and/or re-run
with corrections various data transformations. We have
recently proposed and prototyped the Chimera virtual
data system, a new database-driven approach to this
problem. We present here a major application study in
which we apply Chimera to a challenging data analysis
problem: the identification of galaxy clusters within
the Sloan Digital Sky Survey. We describe the problem,
its computational procedures, and the use of Chimera to
plan and orchestrate the workflow of thousands of tasks
on a data grid comprising hundreds of computers. This
experience suggests that a general set of tools can
indeed enhance the accuracy and productivity of
scientific data reduction and that further development
and application of this paradigm will offer great
value.",
acknowledgement = ack-nhfb,
}
%%% SC2002 paper (PDF pap219): Active Proxy-G query-caching middleware
%%% for Grid data analysis.
@InProceedings{Andrade:2002:APG,
author = "Henrique Andrade and Tahsin Kurc and Alan Sussman and
Joel Saltz",
title = "{Active Proxy-G}: Optimizing the Query Execution
Process in the Grid",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap219.pdf",
abstract = "The Grid environment facilitates collaborative work
and allows many users to query and process data over
geographically dispersed data repositories. Over the
past several years, there has been a growing interest
in developing applications that interactively analyze
datasets, potentially in a collaborative setting. We
describe the Active Proxy-G service that is able to
cache query results, use those results for answering
new incoming queries, generate subqueries for the parts
of a query that cannot be produced from the cache, and
submit the subqueries for final processing at
application servers that store the raw datasets. We
present an experimental evaluation to illustrate the
effects of various design tradeoffs. We also show the
benefits that two real applications gain from using the
middleware.",
acknowledgement = ack-nhfb,
}
%%% SC2002 paper (PDF pap239): Giggle framework for scalable replica
%%% location services in data grids.
@InProceedings{Chervenak:2002:GFC,
author = "Ann Chervenak and Ewa Deelman and Ian Foster and
Leanne Guy and Wolfgang Hoschek and Adriana Iamnitchi
and Carl Kesselman and Peter Kunszt and Matei Ripeanu
and Bob Schwartzkopf and Heinz Stockinger and Kurt
Stockinger and Brian Tierney",
title = "{Giggle}: {A} Framework for Constructing Scalable
Replica Location Services",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap239.pdf",
abstract = "In wide area computing systems, it is often desirable
to create remote read-only copies (replicas) of files.
Replication can be used to reduce access latency,
improve data locality, and/or increase robustness,
scalability and performance for distributed
applications. We define a replica location service
(RLS) as a system that maintains and provides access to
information about the physical locations of copies. An
RLS typically functions as one component of a data grid
architecture. This paper makes the following
contributions. First, we characterize RLS requirements.
Next, we describe a parameterized architectural
framework, which we name Giggle (for GIGa-scale Global
Location Engine), within which a wide range of RLSs can
be defined. We define several concrete instantiations
of this framework with different performance
characteristics. Finally, we present initial
performance results for an RLS prototype, demonstrating
that RLS systems can be constructed that meet
performance goals.",
acknowledgement = ack-nhfb,
}
%%% SC2002 paper (PDF pap196): early evaluation of the IBM p690 at ORNL.
%%% NOTE(review): the citation key is formed from the LAST author (Bland),
%%% not the first (Worley) --- presumably intentional in this file's keying
%%% scheme; verify before renaming, since other files may cite this key.
@InProceedings{Bland:2002:EEI,
author = "P. H. Worley and T. H. {Dunigan, Jr.} and M. R. Fahey
and J. B. {White III} and A. S. Bland",
title = "Early Evaluation of the {IBM p690}",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap196.pdf",
abstract = "Oak Ridge National Laboratory recently received 27
32-way IBM pSeries 690 SMP nodes. In this paper, we
describe our initial evaluation of the p690
architecture, focusing on the performance of benchmarks
and applications that are representative of the
expected production workload.",
acknowledgement = ack-nhfb,
}
%%% SC2002 paper (PDF pap207): BlueGene/L overview (IBM/LLNL), with a very
%%% large corporate author list.
%%% NOTE(review): "C. Cacaval" looks like a misspelling of "C. Ca{\c{s}}caval"
%%% (Calin Cascaval) --- verify against the published author list before
%%% correcting.
@InProceedings{Adiga:2002:OBS,
author = "N. R. Adiga and G. Almasi and G. S. Almasi and Y.
Aridor and R. Barik and D. Beece and R. Bellofatto and
G. Bhanot and R. Bickford and M. Blumrich and
A. A. Bright and J. Brunheroto and C. Cacaval and
J. Casta{\~n}os and W. Chan and L. Ceze and P. Coteus
and S. Chatterjee and D. Chen and G. Chiu and
T. M. Cipolla and P. Crumley and K. M. Desai and
A. Deutsch and T. Domany and M. B. Dombrowa and
W. Donath and M. Eleftheriou and C. Erway and J. Esch
and B. Fitch and J. Gagliano and A. Gara and R. Garg
and R. Germain and M. E. Giampapa and B. Gopalsamy and
J. Gunnels and M. Gupta and F. Gustavson and S. Hall
and R. A. Haring and D. Heidel and P. Heidelberger and
L. M. Herger and D. Hoenicke and R. D. Jackson and
T. Jamal-Eddine and G. V. Kopcsay and E. Krevat and
M. P. Kurhekar and A. P. Lanzetta and D. Lieber and
L. K. Liu and M. Lu and M. Mendell and A. Misra and
Y. Moatti and L. Mok and J. E. Moreira and
B. J. Nathanson and M. Newton and M. Ohmacht and
A. Oliner and V. Pandit and R. B. Pudota and R. Rand
and R. Regan and B. Rubin and A. Ruehli and S. Rus and
R. K. Sahoo and A. Sanomiya and E. Schenfeld and
M. Sharma and E. Shmueli and S. Singh and P. Song and
V. Srinivasan and B. D. Steinmacher-Burow and
K. Strauss and C. Surovic and R. Swetz and T. Takken
and R. B. Tremaine and M. Tsao and A. R. Umamaheshwaran
and P. Verma and P. Vranas and T. J. C. Ward and
M. Wazlowski and W. Barrett and C. Engel and B. Drehmel
and B. Hilgart and D. Hill and F. Kasemkhani and
D. Krolak and C. T. Li and T. Liebsch and J. Marcella
and A. Muff and A. Okomo and M. Rouse and A. Schram and
M. Tubbs and G. Ulsh and C. Wait and J. Wittrup and
M. Bae and K. Dockser and L. Kissel and M. K. Seager
and J. S. Vetter and K. Yates",
title = "An Overview of the {BlueGene/L} Supercomputer",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap207.pdf",
abstract = "This paper gives an overview of the BlueGene/L
Supercomputer. This is a jointly funded research
partnership between IBM and the Lawrence Livermore
National Laboratory as part of the United States
Department of Energy ASCI Advanced Architecture
Research Program. Application performance and scaling
studies have recently been initiated with partners at a
number of academic and government institutions,
including the San Diego Supercomputer Center and the
California Institute of Technology. This massively
parallel system of 65,536 nodes is based on a new
architecture that exploits system-on-a-chip technology
to deliver target peak processing power of 360
teraFLOPS (trillion floating-point operations per
second). The machine is scheduled to be operational in
the 2004-2005 time frame, at price/performance and
power consumption/performance targets unobtainable with
conventional architectures.",
acknowledgement = ack-nhfb,
}
%%% SC2002 paper (PDF pap210): Green Destiny 240-processor Beowulf cluster.
%%% NOTE(review): restored superscripts lost in text extraction
%%% ("meter2" -> meter$^2$, "meter3" -> meter$^3$) and added the missing
%%% final period of the abstract.
@InProceedings{Warren:2002:HDC,
author = "Michael S. Warren and Eric H. Weigle and Wu-Chun
Feng",
title = "High-Density Computing: {A} 240-Processor {Beowulf} in
One Cubic Meter",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap210.pdf",
abstract = "We present results from computations on Green Destiny,
a 240-processor Beowulf cluster which is contained
entirely within a single 19-inch wide 42U rack. The
cluster consists of 240 Transmeta TM5600 667-MHz CPUs
mounted on RLX Technologies motherboard blades. The
blades are mounted side-by-side in an RLX 3U rack-mount
chassis, which holds 24 blades. The overall cluster
contains 10 chassis and associated Fast and Gigabit
Ethernet switches. The system has a footprint of 0.5
meter$^2$ (6 square feet), a volume of 0.85 meter$^3$ (30
cubic feet) and a measured power dissipation under load
of 5200 watts (including network switches). We have
measured the performance of the cluster using a
gravitational treecode N-body simulation of galaxy
formation using 200 million particles, which sustained
an average of 38.9 Gflops on 212 nodes of the system.
We also present results from a three-dimensional
hydrodynamic simulation of a core-collapse supernova.",
acknowledgement = ack-nhfb,
keywords = "Beowulf; cluster; blade server; RLX; Transmeta; code
morphing; VLIW; performance-per-square-foot;
MIPS-per-watt",
}
%%% SC2002 paper (PDF pap167): departmental computing Grid applied to
%%% tapping-sound-analysis NDE simulations (Globus/Condor).
@InProceedings{Kim:2002:UDC,
author = "Seung Jo Kim and Joon-Seok Hwang and Chang Sung Lee
and Sangsan Lee",
title = "Utilization of Departmental Computing {GRID} System
for Development of an Artificial Intelligent Tapping
Inspection Method, Tapping Sound Analysis",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap167.pdf",
abstract = "Tapping Sound Analysis is a new NDE method, which
determines the existence of subsurface defects by
comparing the tapping sound of test structure and
original healthy structure. The tapping sound of
original healthy structure is named sound print of the
structure and is obtained through high precision
computation. Because many tapping points are required
to obtain the exact sound print data, many times of
tapping sound simulation are required. The simulation
of tapping sound requires complicated numerical
procedures. Departmental Computing GRID system was
utilized to run numerical simulations. Three cluster
systems and one PC-farm system comprise DCG system.
Tapping sound simulations were launched and monitored
through Globus and CONDOR. A total of 160 Tera
floating-point (double-precision) operations was
performed and the elapsed time was 41,880 sec. From the
numerical experiments, Grid computing technology
reduced the necessary time to make sound print database
and made TSA a feasible and practical methodology.",
acknowledgement = ack-nhfb,
}
%%% SC2002 paper (PDF pap111): multiscale MD/QM simulations on a US--Japan
%%% Grid of PC clusters.
%%% NOTE(review): inserted the missing space after the semicolon in the
%%% keywords list ("dynamics;quantum" -> "dynamics; quantum") to match the
%%% separator style used by the rest of the field.
@InProceedings{Kikuchi:2002:CSG,
author = "Hideaki Kikuchi and Rajiv K. Kalia and Aiichiro Nakano
and Priya Vashishta and Hiroshi Iyetomi and Shuji Ogata
and Takahisa Kouno and Fuyuki Shimojo and Kenji Tsuruta
and Subhash Saini",
title = "Collaborative Simulation Grid: Multiscale
Quantum-Mechanical\slash Classical Atomistic
Simulations on Distributed {PC} Clusters in the {US}
and {Japan}",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap111.pdf",
abstract = "A multidisciplinary, collaborative simulation has been
performed on a Grid of geographically distributed PC
clusters. The multiscale simulation approach seamlessly
combines (i) atomistic simulation based on the
molecular dynamics (MD) method and (ii) quantum
mechanical (QM) calculation based on the density
functional theory (DFT), so that accurate but less
scalable computations are performed only where they are
needed. The multiscale MD/QM simulation code has been
Grid-enabled using (i) a modular, additive
hybridization scheme, (ii) multiple QM clustering, and
(iii) computation/communication overlapping. The
Gridified MD/QM simulation code has been used to study
environmental effects of water molecules on fracture in
silicon. A preliminary run of the code has achieved a
parallel efficiency of 94\% on 25 PCs distributed over
3 PC clusters in the US and Japan, and a larger test
involving 154 processors on 5 distributed PC clusters
is in progress.",
acknowledgement = ack-nhfb,
keywords = "Grid application; multiscale simulation; molecular
dynamics; quantum mechanics; density functional theory",
}
%%% SC2002 paper (PDF pap141): QMView/GAMESS computational-chemistry
%%% integration into a grid workflow environment.
@InProceedings{Baldridge:2002:QGI,
author = "Kim K. Baldridge and Jerry P. Greenberg and Stephen T.
Elbert and Stephen Mock and Philip Papadopoulos",
title = "{QMView} and {GAMESS}: Integration into the {World
Wide Computational Grid}",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap141.pdf",
abstract = "High performance computing, storage, visualization,
and database infrastructures are increasing
geometrically in complexity as scientists move towards
grid-based computing. While this is natural, it has the
effect of pushing computational capabilities beyond the
reach of scientists because of the time needed to
harness the infrastructure. Hiding the complexity of
networked resources becomes essential if scientists are
to utilize them effectively. In this work, we describe
our efforts to integrate various computational
chemistry components into a scientific computing
environment. We briefly describe improvements we have
made to individual components of the chemistry
environment as well as future directions, followed by a
more in-depth discussion of our strategy for
integration into a grid workflow environment based on
web services, which enables access to remote resources
while shielding users from the complexities of the grid
infrastructures. A preliminary schema for storing data
obtained from computational chemistry calculations is
also described.",
acknowledgement = ack-nhfb,
}
%%% SC2002 paper (PDF pap202): Proactive Directory Service (PDS) for grids
%%% and pervasive computing.
%%% NOTE(review): first author may carry an accent (Fabi{\'a}n E.
%%% Bustamante) --- verify against the published paper before changing.
@InProceedings{Bustamante:2002:SDS,
author = "Fabian E. Bustamante and Patrick Widener and Karsten
Schwan",
title = "Scalable Directory Services Using Proactivity",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap202.pdf",
abstract = "Common to computational grids and pervasive computing
is the need for an expressive, efficient, and scalable
directory service that provides information about
objects in the environment. We argue that a directory
interface that `pushes' information to clients about
changes to objects can significantly improve
scalability. This paper describes the design,
implementation, and evaluation of the Proactive
Directory Service (PDS). PDS' interface supports a
customizable `proactive' mode through which clients can
subscribe to be notified about changes to their objects
of interest. Clients can dynamically tune the detail
and granularity of these notifications through filter
functions instantiated at the server or at the object's
owner, and by remotely tuning the functionality of
those filters. We compare PDS' performance against
off-the-shelf implementations of DNS and the
Lightweight Directory Access Protocol. Our evaluation
results confirm the expected performance advantages of
this approach and demonstrate that customized
notification through filter functions can reduce
bandwidth utilization while improving the performance
of both clients and directory servers.",
acknowledgement = ack-nhfb,
}
%%% SC2002 paper (PDF pap234): relational monitoring-data archive for Grid
%%% environments.
%%% NOTE(review): fixed a tripled closing quote in the abstract
%%% (``Grid Monitoring Architecture''' -> ...Architecture'').
@InProceedings{Lee:2002:MDA,
author = "Jason Lee and Dan Gunter and Martin Stoufer and Brian
Tierney",
title = "Monitoring Data Archives for Grid Environments",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap234.pdf",
abstract = "Developers and users of high-performance distributed
systems often observe performance problems such as
unexpectedly low throughput or high latency. To
determine the source of these performance problems,
detailed end-to-end monitoring data from applications,
networks, operating systems, and hardware must be
correlated across time and space. Researchers need to
be able to view and compare this very detailed
monitoring data from a variety of angles. To address
this problem, we propose a relational monitoring data
archive that is designed to efficiently handle
high-volume streams of monitoring data. In this paper
we present an instrumentation and monitoring event
archive service that can be used to collect and
aggregate detailed end-to-end monitoring information
from distributed applications. This archive service is
designed to be scalable and fault tolerant. We also
show how the archive is based on the ``Grid Monitoring
Architecture'' defined by the Global Grid Forum.",
acknowledgement = ack-nhfb,
}
%%% SC2002 paper (PDF pap213): merging distributed data streams on common
%%% keys.
%%% NOTE(review): the abstract read "of length less than ." --- a symbol
%%% (presumably the fixed buffer length named earlier in the abstract) was
%%% lost in text extraction. Restored as "less than the buffer length";
%%% verify the exact symbol against the published abstract.
@InProceedings{Mazzucco:2002:MMD,
author = "Marco Mazzucco and Asvin Ananthanarayan and Robert L.
Grossman and Jorge Levera and Gokulnath B. Rao",
title = "Merging Multiple Data Streams on Common Keys Over High
Performance Networks",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap213.pdf",
abstract = "The model for data mining on streaming data assumes
that there is a buffer of fixed length and a data
stream of infinite length and the challenge is to
extract patterns, changes, anomalies, and statistically
significant structures by examining the data one time
and storing records and derived attributes of length
less than the buffer length. As data grids, data webs,
and semantic webs
become more common, mining distributed streaming data
will become more and more important. The first step
when presented with two or more distributed streams is
to merge them using a common key. In this paper, we
present two algorithms for merging streaming data using
a common key. We also present experimental studies
showing these algorithms scale in practice to OC-12
networks.",
acknowledgement = ack-nhfb,
}
%%% ====================================================================
%%% Cross-referenced entries must come last:
%%% Crossref parent for all @InProceedings entries above. In classic BibTeX
%%% a crossref'ed entry must appear AFTER every entry that references it
%%% (see the comment preceding this entry), so keep this last in the file.
%%% "????" placeholders mark data (pages, LCCN) not yet known, per the
%%% conventions described in the file header.
@Proceedings{IEEE:2002:STI,
editor = "{IEEE}",
booktitle = "{SC2002}: From Terabytes to Insight. Proceedings of
the {IEEE ACM SC 2002 Conference, November 16--22,
2002, Baltimore, MD, USA}",
title = "{SC2002}: From Terabytes to Insight. Proceedings of
the {IEEE ACM SC 2002 Conference, November 16--22,
2002, Baltimore, MD, USA}",
publisher = pub-IEEE,
address = pub-IEEE:adr,
pages = "????",
year = "2002",
ISBN = "0-7695-1524-X",
ISBN-13 = "978-0-7695-1524-3",
LCCN = "????",
bibdate = "Thu Feb 21 18:29:36 2002",
acknowledgement = ack-nhfb,
}